author     hkuang <hkuang@google.com>  2013-07-25 11:11:39 -0700
committer  hkuang <hkuang@google.com>  2013-07-25 12:03:12 -0700
commit     91037db265ecdd914a26e056cf69207b4f50924e
tree       c78c618cf6d0ffb187e2734d524bca19698b3c0d
parent     ba164dffc5a6795bce97fae02b51ccf3330e15e4
Roll latest libvpx into Android.
Makes VP9 decoding 2X faster than the old version.
The checkout is from the master branch (hash: 242157c756314827ad9244952c7253e8900b9626).
Change-Id: Ibe67b3ee19f82b87df2416826b63a67f7f79b63a
232 files changed, 21557 insertions, 17417 deletions
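Most of the vp9_rtcd.h changes in the diff below follow libvpx's RTCD (run-time CPU detection) convention: for each hot function the per-target header declares the portable `_c` implementation and, in the armv7a-neon target, the matching `_neon` implementation, then binds the generic name to one of them with a `#define`. The sketch below is illustrative only, not the generated header itself; the real headers are pregenerated per target directory (armv7a-neon/, armv7a/, generic/), and `HAVE_NEON` is used here merely as a stand-in for that per-directory split. It shows the pattern for `vp9_convolve8`, whose prototypes appear verbatim in the hunks that follow.

```c
/* Illustrative sketch of the dispatch pattern in the vp9_rtcd.h hunks below.
 * NOTE: the actual headers are generated per build target; HAVE_NEON here is
 * an assumption standing in for that per-target split. */
#include <stddef.h>   /* ptrdiff_t */
#include <stdint.h>   /* uint8_t, int16_t */

/* Both variants are declared with identical signatures... */
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h);
void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h);

/* ...and the generic name is bound to whichever implementation the target
 * supports, so callers always invoke vp9_convolve8(). */
#if HAVE_NEON
#define vp9_convolve8 vp9_convolve8_neon
#else
#define vp9_convolve8 vp9_convolve8_c
#endif
```

Because the Android build ships one pregenerated header per target, the `_neon` bindings only appear in armv7a-neon/vp9_rtcd.h; the armv7a/ and generic/ headers in this same change keep the `_c` bindings.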
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt index 15973e2..7f331c0 100644 --- a/armv7a-neon/libvpx_srcs.txt +++ b/armv7a-neon/libvpx_srcs.txt @@ -119,7 +119,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -133,7 +132,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s @@ -205,11 +203,18 @@ vp8/vp8_cx_iface.c vp8/vp8cx.mk vp8/vp8_dx_iface.c vp8/vp8dx.mk +vp9/common/arm/neon/vp9_convolve8_avg_neon.asm.s +vp9/common/arm/neon/vp9_convolve8_neon.asm.s +vp9/common/arm/neon/vp9_convolve_neon.c +vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s +vp9/common/arm/neon/vp9_loopfilter_neon.asm.s +vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm.s vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -233,10 +238,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -264,7 +265,7 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c +vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm.s vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -273,6 +274,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h index cc3c834..6e6ff71 100644 --- a/armv7a-neon/vp9_rtcd.h +++ b/armv7a-neon/vp9_rtcd.h @@ -38,53 +38,195 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void 
vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t 
*yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); -#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c +void vp9_add_constant_residual_8x8_neon(const int16_t diff, uint8_t *dest, int stride); +#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_neon void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, int stride); -#define vp9_add_constant_residual_16x16 vp9_add_constant_residual_16x16_c +void vp9_add_constant_residual_16x16_neon(const int16_t diff, uint8_t *dest, int stride); +#define vp9_add_constant_residual_16x16 vp9_add_constant_residual_16x16_neon void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest, int stride); -#define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_c +void vp9_add_constant_residual_32x32_neon(const int16_t diff, uint8_t *dest, int stride); +#define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_neon void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); #define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mbloop_filter_vertical_edge 
vp9_mbloop_filter_vertical_edge_c +void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_neon void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c +void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_neon -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c +void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_neon void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c +void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_neon void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_mb_inner vp9_blend_mb_inner_c @@ -95,23 +237,35 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8 vp9_convolve8_c +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_horiz vp9_convolve8_horiz_c +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_vert vp9_convolve8_vert_c +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t 
src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8 vp9_convolve8_neon -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_avg vp9_convolve8_avg_c +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_horiz vp9_convolve8_horiz_neon -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_vert vp9_convolve8_vert_neon -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_avg vp9_convolve8_avg_neon + +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_neon + +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_neon void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_1_add 
vp9_short_idct4x4_1_add_c @@ -120,7 +274,8 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct8x8_add vp9_short_idct8x8_add_c +void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_add vp9_short_idct8x8_add_neon void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c @@ -158,9 +313,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/armv7a-neon/vpx_config.h b/armv7a-neon/vpx_config.h index a808f7c..6f45f7e 100644 --- a/armv7a-neon/vpx_config.h +++ b/armv7a-neon/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/armv7a-neon/vpx_scale_rtcd.h b/armv7a-neon/vpx_scale_rtcd.h index ed84626..9972777 100644 --- a/armv7a-neon/vpx_scale_rtcd.h +++ b/armv7a-neon/vpx_scale_rtcd.h @@ -45,6 +45,9 @@ void vp8_yv12_copy_y_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" diff --git a/armv7a/libvpx_srcs.txt b/armv7a/libvpx_srcs.txt index bab4901..a929dc3 100644 --- a/armv7a/libvpx_srcs.txt +++ b/armv7a/libvpx_srcs.txt @@ -88,7 +88,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -102,7 +101,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s @@ -170,8 +168,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -195,10 +194,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -226,7 +221,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h 
-vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -235,6 +229,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h index cc3c834..d6b244d 100644 --- a/armv7a/vp9_rtcd.h +++ b/armv7a/vp9_rtcd.h @@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define 
vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, 
uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, 
const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/armv7a/vpx_config.h b/armv7a/vpx_config.h index e04f103..be08d2a 100644 --- a/armv7a/vpx_config.h +++ b/armv7a/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/armv7a/vpx_scale_rtcd.h b/armv7a/vpx_scale_rtcd.h index 3f25632..d4212f2 100644 --- a/armv7a/vpx_scale_rtcd.h +++ b/armv7a/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" diff --git a/generic/libvpx_srcs.txt b/generic/libvpx_srcs.txt index 8c1ec80..402ac24 100644 --- a/generic/libvpx_srcs.txt +++ b/generic/libvpx_srcs.txt @@ -60,7 +60,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -74,7 +73,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/bitstream.c vp8/encoder/bitstream.h vp8/encoder/block.h @@ -130,8 +128,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -155,10 +154,6 @@ vp9/common/vp9_idct.h 
vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -186,7 +181,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -195,6 +189,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h index dee08d4..c0824cb 100644 --- a/generic/vp9_rtcd.h +++ b/generic/vp9_rtcd.h @@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + 
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t 
*ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, 
const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/generic/vpx_config.h b/generic/vpx_config.h index 44e6842..37dcff9 100644 --- a/generic/vpx_config.h +++ b/generic/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/generic/vpx_scale_rtcd.h b/generic/vpx_scale_rtcd.h index 3a1db05..c2842ee 100644 --- a/generic/vpx_scale_rtcd.h +++ b/generic/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" @@ -60,14 +60,10 @@ LOCAL_SRC_FILES += $(libvpx_target)/vpx_config.c # used yet but are included in the comments for future reference. 
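The vp9_rtcd.h hunks above change the convolve stride arguments from int to ptrdiff_t, the type produced by pointer subtraction, which avoids narrowing when strides are added to pointers on 64-bit targets. A minimal, self-contained sketch of that idea (not part of the patch; the buffer and function names below are hypothetical):

    #include <cstddef>   // ptrdiff_t
    #include <cstdint>   // uint8_t
    #include <vector>

    // Hypothetical strided copy: the stride is a ptrdiff_t because it is
    // added to a pointer, exactly as in the updated vp9_convolve8_* prototypes.
    static void copy_plane(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) dst[x] = src[x];
        src += src_stride;   // pointer + ptrdiff_t: no implicit narrowing
        dst += dst_stride;
      }
    }

    int main() {
      std::vector<uint8_t> src(64 * 64, 128), dst(64 * 64, 0);
      copy_plane(src.data(), 64, dst.data(), 64, 64, 64);
      return dst[0] == 128 ? 0 : 1;
    }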
libvpx_asm_offsets_intermediates := \ - vp8/common/vp8_asm_com_offsets.intermediate \ - vp8/decoder/vp8_asm_dec_offsets.intermediate \ vp8/encoder/vp8_asm_enc_offsets.intermediate \ vpx_scale/vpx_scale_asm_offsets.intermediate \ libvpx_asm_offsets_files := \ - vp8/common/vp8_asm_com_offsets.asm \ - vp8/decoder/vp8_asm_dec_offsets.asm \ vp8/encoder/vp8_asm_enc_offsets.asm \ vpx_scale/vpx_scale_asm_offsets.asm \ diff --git a/libvpx/README b/libvpx/README index 0475dad..92cc074 100644 --- a/libvpx/README +++ b/libvpx/README @@ -97,7 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: 5. Configuration errors If the configuration step fails, the first step is to look in the error log. - This defaults to config.err. This should give a good indication of what went + This defaults to config.log. This should give a good indication of what went wrong. If not, contact us for support. SUPPORT diff --git a/libvpx/build/arm-msvs/obj_int_extract.bat b/libvpx/build/arm-msvs/obj_int_extract.bat index 147342d..7fd16a3 100644 --- a/libvpx/build/arm-msvs/obj_int_extract.bat +++ b/libvpx/build/arm-msvs/obj_int_extract.bat @@ -7,18 +7,7 @@ REM in the file PATENTS. All contributing project authors may REM be found in the AUTHORS file in the root of the source tree. echo on -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/common/vp9_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/decoder/vp9_asm_dec_offsets.c" -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/encoder/vp9_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm" - -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/common/vp8_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/decoder/vp8_asm_dec_offsets.c" cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm" obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm" cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c" diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index ee4493d..30a6106 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -75,7 +75,7 @@ Options: Build options: --help print this message - --log=yes|no|FILE file configure log is written to [config.err] + --log=yes|no|FILE file configure log is written to [config.log] --target=TARGET target platform tuple [generic-gnu] --cpu=CPU optimize for a specific cpu rather than a family --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS] @@ -653,6 +653,10 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin12 ;; + *darwin13*) + tgt_isa=x86_64 + tgt_os=darwin13 + ;; x86_64*mingw32*) tgt_os=win64 ;; @@ -751,6 +755,10 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.8" add_ldflags "-mmacosx-version-min=10.8" ;; + *-darwin13-*) + add_cflags "-mmacosx-version-min=10.9" + add_ldflags "-mmacosx-version-min=10.9" + ;; esac # Handle Solaris variants. 
Solaris 10 needs -lposix4 @@ -1296,7 +1304,7 @@ process_detect() { } enable logging -logfile="config.err" +logfile="config.log" self=$0 process() { cmdline_args="$@" diff --git a/libvpx/build/make/gen_msvs_proj.sh b/libvpx/build/make/gen_msvs_proj.sh index cff27c8..fc5011b 100755 --- a/libvpx/build/make/gen_msvs_proj.sh +++ b/libvpx/build/make/gen_msvs_proj.sh @@ -381,7 +381,7 @@ generate_vcproj() { RuntimeLibrary="$debug_runtime" \ UsePrecompiledHeader="0" \ WarningLevel="3" \ - DebugInformationFormat="1" \ + DebugInformationFormat="2" \ $warn_64bit \ $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true" @@ -395,7 +395,7 @@ generate_vcproj() { RuntimeLibrary="$debug_runtime" \ UsePrecompiledHeader="0" \ WarningLevel="3" \ - DebugInformationFormat="1" \ + DebugInformationFormat="2" \ $warn_64bit \ $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true" diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh index 5a8c793..f9fc694 100755 --- a/libvpx/build/make/gen_msvs_sln.sh +++ b/libvpx/build/make/gen_msvs_sln.sh @@ -74,8 +74,13 @@ parse_project() { # assume that all projects have the same list of possible configurations, # so overwriting old config_lists is not a problem - config_list=`grep -A1 '<Configuration' $file | - grep Name | cut -d\" -f2` + if [ "$sfx" = "vcproj" ]; then + config_list=`grep -A1 '<Configuration' $file | + grep Name | cut -d\" -f2` + else + config_list=`grep -B1 'Label="Configuration"' $file | + grep Condition | cut -d\' -f4` + fi proj_list="${proj_list} ${var}" } @@ -168,9 +173,14 @@ process_makefile() { IFS=$'\r'$'\n' local TAB=$'\t' cat <<EOF -found_devenv := \$(shell which devenv.com >/dev/null 2>&1 && echo yes) +ifeq (\$(CONFIG_VS_VERSION),7) +MSBUILD_TOOL := devenv.com +else +MSBUILD_TOOL := msbuild.exe +endif +found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes) .nodevenv.once: -${TAB}@echo " * devenv.com not found in path." +${TAB}@echo " * \$(MSBUILD_TOOL) not found in path." ${TAB}@echo " * " ${TAB}@echo " * You will have to build all configurations manually using the" ${TAB}@echo " * Visual Studio IDE. To allow make to build them automatically," @@ -195,16 +205,17 @@ ${TAB}rm -rf "$platform"/"$config" ifneq (\$(found_devenv),) ifeq (\$(CONFIG_VS_VERSION),7) $nows_sln_config: $outfile -${TAB}devenv.com $outfile -build "$config" +${TAB}\$(MSBUILD_TOOL) $outfile -build "$config" else $nows_sln_config: $outfile -${TAB}devenv.com $outfile -build "$sln_config" +${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\ +${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform" endif else $nows_sln_config: $outfile .nodevenv.once -${TAB}@echo " * Skipping build of $sln_config (devenv.com not in path)." +${TAB}@echo " * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)." ${TAB}@echo " * " endif diff --git a/libvpx/build/make/obj_int_extract.c b/libvpx/build/make/obj_int_extract.c index 1604b5e..feed9d9 100644 --- a/libvpx/build/make/obj_int_extract.c +++ b/libvpx/build/make/obj_int_extract.c @@ -38,7 +38,21 @@ int log_msg(const char *fmt, ...) 
{ #include <mach-o/loader.h> #include <mach-o/nlist.h> -int parse_macho(uint8_t *base_buf, size_t sz) { +int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) { + switch (mode) { + case OUTPUT_FMT_RVDS: + printf("%-40s EQU %5d\n", name, val); + return 0; + case OUTPUT_FMT_GAS: + printf(".set %-40s, %5d\n", name, val); + return 0; + default: + log_msg("Unsupported mode: %d", mode); + return 1; + } +} + +int parse_macho(uint8_t *base_buf, size_t sz, output_fmt_t mode) { int i, j; struct mach_header header; uint8_t *buf = base_buf; @@ -156,8 +170,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) { memcpy(&val, base_buf + base_data_section + nl.n_value, sizeof(val)); - printf("%-40s EQU %5d\n", - str_buf + nl.n_un.n_strx + 1, val); + print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val); } else { /* if (bits == 64) */ struct nlist_64 nl; int val; @@ -167,8 +180,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) { memcpy(&val, base_buf + base_data_section + nl.n_value, sizeof(val)); - printf("%-40s EQU %5d\n", - str_buf + nl.n_un.n_strx + 1, val); + print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val); } } } @@ -796,7 +808,7 @@ int main(int argc, char **argv) { #if defined(__GNUC__) && __GNUC__ #if defined(__MACH__) - res = parse_macho(file_buf, file_size); + res = parse_macho(file_buf, file_size, mode); #elif defined(__ELF__) res = parse_elf(file_buf, file_size, mode); #endif diff --git a/libvpx/build/x86-msvs/obj_int_extract.bat b/libvpx/build/x86-msvs/obj_int_extract.bat index 47fef97..4e9b0ec 100644 --- a/libvpx/build/x86-msvs/obj_int_extract.bat +++ b/libvpx/build/x86-msvs/obj_int_extract.bat @@ -7,17 +7,6 @@ REM in the file PATENTS. All contributing project authors may REM be found in the AUTHORS file in the root of the source tree. 
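For context on the obj_int_extract.c hunk above: parse_macho() now takes an output_fmt_t and routes each symbol/value pair through print_macho_equ(), so Mach-O builds can emit either RVDS-style or GAS-style assembly constants. A rough stand-alone sketch of the two output formats (illustrative only; it mirrors, but is not, the patched function, and the symbol name is made up):

    #include <cstdio>

    enum output_fmt_t { OUTPUT_FMT_RVDS, OUTPUT_FMT_GAS };

    // Prints one extracted constant in the selected assembler dialect,
    // in the same spirit as print_macho_equ() in the patch.
    static int print_equ(output_fmt_t mode, const char *name, int val) {
      switch (mode) {
        case OUTPUT_FMT_RVDS: printf("%-40s EQU %5d\n", name, val); return 0;
        case OUTPUT_FMT_GAS:  printf(".set %-40s, %5d\n", name, val); return 0;
      }
      return 1;  // unsupported mode
    }

    int main() {
      print_equ(OUTPUT_FMT_RVDS, "hypothetical_struct_offset", 24);
      print_equ(OUTPUT_FMT_GAS,  "hypothetical_struct_offset", 24);
      return 0;
    }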
echo on -cl /I "./" /I "%1" /nologo /c "%1/vp9/common/vp9_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/vp9_asm_dec_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/vp9_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm" - -cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c" cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm" obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm" diff --git a/libvpx/configure b/libvpx/configure index 28676fb..3651334 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -115,6 +115,7 @@ all_platforms="${all_platforms} x86-darwin9-icc" all_platforms="${all_platforms} x86-darwin10-gcc" all_platforms="${all_platforms} x86-darwin11-gcc" all_platforms="${all_platforms} x86-darwin12-gcc" +all_platforms="${all_platforms} x86-darwin13-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-os2-gcc" @@ -129,6 +130,7 @@ all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" all_platforms="${all_platforms} x86_64-darwin11-gcc" all_platforms="${all_platforms} x86_64-darwin12-gcc" +all_platforms="${all_platforms} x86_64-darwin13-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" @@ -142,6 +144,7 @@ all_platforms="${all_platforms} universal-darwin9-gcc" all_platforms="${all_platforms} universal-darwin10-gcc" all_platforms="${all_platforms} universal-darwin11-gcc" all_platforms="${all_platforms} universal-darwin12-gcc" +all_platforms="${all_platforms} universal-darwin13-gcc" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured @@ -247,7 +250,6 @@ EXPERIMENT_LIST=" multiple_arf non420 alpha - balanced_coeftree " CONFIG_LIST=" external_build @@ -682,6 +684,14 @@ process_toolchain() { # iOS/ARM builds do not work with gtest. This does not match # x86 targets. ;; + *-win*) + # Some mingw toolchains don't have pthread available by default. + # Treat these more like visual studio where threading in gtest + # would be disabled for the same reason. 
+ check_cxx "$@" <<EOF && soft_enable unit_tests +int z; +EOF + ;; *) enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests int z; diff --git a/libvpx/libs.mk b/libvpx/libs.mk index f7ed95b..4aa7dc4 100644 --- a/libvpx/libs.mk +++ b/libvpx/libs.mk @@ -202,6 +202,7 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) libvpx_srcs.txt: @echo " [CREATE] $@" @echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@ +CLEAN-OBJS += libvpx_srcs.txt ifeq ($(CONFIG_EXTERNAL_BUILD),yes) @@ -382,6 +383,11 @@ LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +libvpx_test_srcs.txt: + @echo " [CREATE] $@" + @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@ +CLEAN-OBJS += libvpx_test_srcs.txt + $(LIBVPX_TEST_DATA): @echo " [DOWNLOAD] $@" $(qexec)trap 'rm -f $@' INT TERM &&\ @@ -442,6 +448,10 @@ else include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS)) GTEST_OBJS=$(call objs,$(GTEST_SRCS)) +ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS)) +# Disabling pthreads globally will cause issues on darwin and possibly elsewhere +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0 +endif $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS) @@ -466,7 +476,7 @@ $(foreach bin,$(LIBVPX_TEST_BINS),\ lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\ $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\ $(LIBVPX_TEST_OBJS) \ - -L. -lvpx -lgtest -lpthread -lm)\ + -L. -lvpx -lgtest $(extralibs) -lm)\ )))\ $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\ diff --git a/libvpx/test/altref_test.cc b/libvpx/test/altref_test.cc index 14af265..af25b72 100644 --- a/libvpx/test/altref_test.cc +++ b/libvpx/test/altref_test.cc @@ -33,10 +33,6 @@ class AltRefTest : public ::libvpx_test::EncoderTest, altref_count_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { if (video->frame() == 1) { diff --git a/libvpx/test/borders_test.cc b/libvpx/test/borders_test.cc index 49505ee..7bfece8 100644 --- a/libvpx/test/borders_test.cc +++ b/libvpx/test/borders_test.cc @@ -27,10 +27,6 @@ class BordersTest : public ::libvpx_test::EncoderTest, SetMode(GET_PARAM(1)); } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if ( video->frame() == 1) { diff --git a/libvpx/test/codec_factory.h b/libvpx/test/codec_factory.h index fdae572..cc7b53f 100644 --- a/libvpx/test/codec_factory.h +++ b/libvpx/test/codec_factory.h @@ -134,14 +134,14 @@ class VP8CodecFactory : public CodecFactory { const libvpx_test::VP8CodecFactory kVP8; -#define VP8_INSTANTIATE_TEST_CASE(test, params)\ +#define VP8_INSTANTIATE_TEST_CASE(test, ...)\ INSTANTIATE_TEST_CASE_P(VP8, test, \ ::testing::Combine( \ ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \ &libvpx_test::kVP8)), \ - params)) + __VA_ARGS__)) #else -#define VP8_INSTANTIATE_TEST_CASE(test, params) +#define VP8_INSTANTIATE_TEST_CASE(test, ...) 
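The codec_factory.h hunk above (and its VP9 counterpart just below) turns the INSTANTIATE_TEST_CASE wrappers into variadic macros, so a test can combine the codec factory with more than one gtest parameter generator, which the new CpuSpeedTest relies on. A small sketch of the forwarding pattern, using a made-up macro and function rather than the gtest machinery:

    #include <cstdio>

    // Made-up example: a variadic macro forwards any number of extra
    // arguments, where the old single-parameter form could forward only one.
    #define COMBINE_WITH_FACTORY(factory, ...) combine(factory, __VA_ARGS__)

    static int combine(int factory, int a) { return factory + a; }
    static int combine(int factory, int a, int b) { return factory + a + b; }

    int main() {
      // One extra argument (old behaviour) and two (what CpuSpeedTest needs).
      printf("%d %d\n", COMBINE_WITH_FACTORY(100, 1),
             COMBINE_WITH_FACTORY(100, 1, 2));
      return 0;
    }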
#endif // CONFIG_VP8 @@ -216,14 +216,14 @@ class VP9CodecFactory : public CodecFactory { const libvpx_test::VP9CodecFactory kVP9; -#define VP9_INSTANTIATE_TEST_CASE(test, params)\ +#define VP9_INSTANTIATE_TEST_CASE(test, ...)\ INSTANTIATE_TEST_CASE_P(VP9, test, \ ::testing::Combine( \ ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \ &libvpx_test::kVP9)), \ - params)) + __VA_ARGS__)) #else -#define VP9_INSTANTIATE_TEST_CASE(test, params) +#define VP9_INSTANTIATE_TEST_CASE(test, ...) #endif // CONFIG_VP9 diff --git a/libvpx/test/config_test.cc b/libvpx/test/config_test.cc index 9008728..36c6330 100644 --- a/libvpx/test/config_test.cc +++ b/libvpx/test/config_test.cc @@ -40,10 +40,6 @@ class ConfigTest : public ::libvpx_test::EncoderTest, ++frame_count_out_; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - unsigned int frame_count_in_; unsigned int frame_count_out_; unsigned int frame_count_max_; diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index fd2bd36..3b72129 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -22,8 +22,8 @@ extern "C" { } namespace { -typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h); @@ -211,7 +211,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) { virtual void SetUp() { UUT_ = GET_PARAM(2); - /* Set up guard blocks for an inner block cetered in the outer block */ + /* Set up guard blocks for an inner block centered in the outer block */ for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) output_[i] = 255; @@ -546,4 +546,26 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( make_tuple(32, 64, &convolve8_ssse3), make_tuple(64, 64, &convolve8_ssse3))); #endif + +#if HAVE_NEON +const ConvolveFunctions convolve8_neon( + vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, + vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, + vp9_convolve8_neon, vp9_convolve8_avg_neon); + +INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve8_neon), + make_tuple(8, 4, &convolve8_neon), + make_tuple(4, 8, &convolve8_neon), + make_tuple(8, 8, &convolve8_neon), + make_tuple(16, 8, &convolve8_neon), + make_tuple(8, 16, &convolve8_neon), + make_tuple(16, 16, &convolve8_neon), + make_tuple(32, 16, &convolve8_neon), + make_tuple(16, 32, &convolve8_neon), + make_tuple(32, 32, &convolve8_neon), + make_tuple(64, 32, &convolve8_neon), + make_tuple(32, 64, &convolve8_neon), + make_tuple(64, 64, &convolve8_neon))); +#endif } // namespace diff --git a/libvpx/test/cpu_speed_test.cc b/libvpx/test/cpu_speed_test.cc new file mode 100644 index 0000000..e6ad75b --- /dev/null +++ b/libvpx/test/cpu_speed_test.cc @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <climits> +#include <vector> +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class CpuSpeedTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params< + libvpx_test::TestMode, int> { + protected: + CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { + } + } + int set_cpu_used_; +}; + +TEST_P(CpuSpeedTest, TestQ0) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 400; + cfg_.rc_max_quantizer = 0; + cfg_.rc_min_quantizer = 0; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 20); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + + +TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 12000; + cfg_.rc_max_quantizer = 10; + cfg_.rc_min_quantizer = 0; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +TEST_P(CpuSpeedTest, TestLowBitrate) { + // Validate that this clip encodes and decodes without a mismatch + // when passing in a very high min q. This pushes the encoder to producing + // lots of small partitions which might will test the other condition. 
+ + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 200; + cfg_.rc_min_quantizer = 40; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +using std::tr1::make_tuple; + +#define VP9_FACTORY \ + static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9) + +VP9_INSTANTIATE_TEST_CASE( + CpuSpeedTest, + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Range(0, 3)); +} // namespace diff --git a/libvpx/test/cq_test.cc b/libvpx/test/cq_test.cc index a6a4b8e..a2c8291 100644 --- a/libvpx/test/cq_test.cc +++ b/libvpx/test/cq_test.cc @@ -42,10 +42,6 @@ class CQTest : public ::libvpx_test::EncoderTest, n_frames_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { if (video->frame() == 1) { diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc index 85eeafb..287e805 100644 --- a/libvpx/test/datarate_test.cc +++ b/libvpx/test/datarate_test.cc @@ -36,10 +36,6 @@ class DatarateTest : public ::libvpx_test::EncoderTest, duration_ = 0.0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { const vpx_rational_t tb = video->timebase(); diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 9fb45d6..0795054 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -13,6 +13,7 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx_ports/mem.h" extern "C" { #include "vp9/common/vp9_entropy.h" @@ -264,59 +265,79 @@ void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) { } } +void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int /*tx_type*/) { + vp9_short_fdct16x16_c(in, out, stride); +} +void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int /*tx_type*/) { + vp9_short_idct16x16_add_c(out, dst, stride >> 1); +} +void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int tx_type) { + // FIXME(jingning): need to test both SSE2 and c +#if HAVE_SSE2 + vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type); +#else + vp9_short_fht16x16_c(in, out, stride >> 1, tx_type); +#endif +} +void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type); +} -TEST(VP9Idct16x16Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t in[256], coeff[256]; - uint8_t dst[256], src[256]; - double out_r[256]; - - for (int j = 0; j < 256; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); +class FwdTrans16x16Test : public ::testing::TestWithParam<int> { + public: + virtual ~FwdTrans16x16Test() {} + + virtual void SetUp() { + tx_type_ = GetParam(); + if (tx_type_ == 0) { + fwd_txfm = fdct16x16; + inv_txfm = idct16x16_add; + } else { + fwd_txfm = fht16x16; + inv_txfm = iht16x16_add; } - // Initialize a test block with input range [-255, 255]. 
- for (int j = 0; j < 256; ++j) - in[j] = src[j] - dst[j]; + } - reference_16x16_dct_2d(in, out_r); - for (int j = 0; j < 256; j++) - coeff[j] = round(out_r[j]); - vp9_short_idct16x16_add_c(coeff, dst, 16); - for (int j = 0; j < 256; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - EXPECT_GE(1, error) - << "Error: 16x16 IDCT has error " << error - << " at index " << j; - } + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*fwd_txfm)(in, out, dst, stride, tx_type); } -} + void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*inv_txfm)(in, out, dst, stride, tx_type); + } + + int tx_type_; + void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int); + void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int); +}; -// we need enable fdct test once we re-do the 16 point fdct. -TEST(VP9Fdct16x16Test, AccuracyCheck) { +TEST_P(FwdTrans16x16Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; double total_error = 0; - const int count_test_block = 1000; + const int count_test_block = 10000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[256]; - int16_t test_temp_block[256]; - uint8_t dst[256], src[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256); for (int j = 0; j < 256; ++j) { src[j] = rnd.Rand8(); dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 256; ++j) + // Initialize a test block with input range [-255, 255]. test_input_block[j] = src[j] - dst[j]; + } const int pitch = 32; - vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch); - vp9_short_idct16x16_add_c(test_temp_block, dst, 16); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 256; ++j) { const int diff = dst[j] - src[j]; @@ -328,18 +349,21 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) { } EXPECT_GE(1, max_error) - << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1"; + << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; EXPECT_GE(count_test_block , total_error) - << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block"; + << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; } -TEST(VP9Fdct16x16Test, CoeffSizeCheck) { +TEST_P(FwdTrans16x16Test, CoeffSizeCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; for (int i = 0; i < count_test_block; ++i) { - int16_t input_block[256], input_extreme_block[256]; - int16_t output_block[256], output_extreme_block[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); // Initialize a test block with input range [-255, 255]. 
for (int j = 0; j < 256; ++j) { @@ -351,16 +375,50 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) { input_extreme_block[j] = 255; const int pitch = 32; - vp9_short_fdct16x16_c(input_block, output_block, pitch); - vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch); + RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_); + RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_); // The minimum quant value is 4. for (int j = 0; j < 256; ++j) { - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j])) + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j])) - << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE"; + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j])) + << "Error: 16x16 FDCT extreme has coefficient larger " + << "than 4*DCT_MAX_VALUE"; + } + } +} + +INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4)); + +TEST(VP9Idct16x16Test, AccuracyCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + for (int i = 0; i < count_test_block; ++i) { + int16_t in[256], coeff[256]; + uint8_t dst[256], src[256]; + double out_r[256]; + + for (int j = 0; j < 256; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 256; ++j) + in[j] = src[j] - dst[j]; + + reference_16x16_dct_2d(in, out_r); + for (int j = 0; j < 256; j++) + coeff[j] = round(out_r[j]); + vp9_short_idct16x16_add_c(coeff, dst, 16); + for (int j = 0; j < 256; ++j) { + const int diff = dst[j] - src[j]; + const int error = diff * diff; + EXPECT_GE(1, error) + << "Error: 16x16 IDCT has error " << error + << " at index " << j; } } } + } // namespace diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h index 6aeb96b..dbdc33c 100644 --- a/libvpx/test/encode_test_driver.h +++ b/libvpx/test/encode_test_driver.h @@ -190,7 +190,9 @@ class EncoderTest { virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {} // Hook to determine whether the encode loop should continue. 
- virtual bool Continue() const { return !abort_; } + virtual bool Continue() const { + return !(::testing::Test::HasFatalFailure() || abort_); + } const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc index ddfbd0f..d4a6967 100644 --- a/libvpx/test/error_resilience_test.cc +++ b/libvpx/test/error_resilience_test.cc @@ -50,10 +50,6 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest, mismatch_nframes_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc index 1c887bb..9dcc078 100644 --- a/libvpx/test/fdct4x4_test.cc +++ b/libvpx/test/fdct4x4_test.cc @@ -20,29 +20,75 @@ extern "C" { #include "acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" using libvpx_test::ACMRandom; namespace { +void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int /*tx_type*/) { + vp9_short_fdct4x4_c(in, out, stride); +} +void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int /*tx_type*/) { + vp9_short_idct4x4_add_c(out, dst, stride >> 1); +} +void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int tx_type) { + vp9_short_fht4x4_c(in, out, stride >> 1, tx_type); +} +void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type); +} + +class FwdTrans4x4Test : public ::testing::TestWithParam<int> { + public: + virtual ~FwdTrans4x4Test() {} + virtual void SetUp() { + tx_type_ = GetParam(); + if (tx_type_ == 0) { + fwd_txfm_ = fdct4x4; + inv_txfm_ = idct4x4_add; + } else { + fwd_txfm_ = fht4x4; + inv_txfm_ = iht4x4_add; + } + } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*fwd_txfm_)(in, out, dst, stride, tx_type); + } + + void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*inv_txfm_)(in, out, dst, stride, tx_type); + } + + int tx_type_; + void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type); + void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type); +}; -TEST(Vp9Fdct4x4Test, SignBiasCheck) { +TEST_P(FwdTrans4x4Test, SignBiasCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - int16_t test_input_block[16]; - int16_t test_output_block[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16); const int pitch = 8; int count_sign_block[16][2]; const int count_test_block = 1000000; memset(count_sign_block, 0, sizeof(count_sign_block)); - for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) test_input_block[j] = rnd.Rand8() - rnd.Rand8(); - // TODO(Yaowu): this should be converted to a parameterized test - // to test optimized versions of this function. 
- vp9_short_fdct4x4_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) @@ -56,20 +102,18 @@ TEST(Vp9Fdct4x4Test, SignBiasCheck) { const bool bias_acceptable = (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000); EXPECT_TRUE(bias_acceptable) - << "Error: 4x4 FDCT has a sign bias > 1%" - << " for input range [-255, 255] at index " << j; + << "Error: 4x4 FDCT/FHT has a sign bias > 1%" + << " for input range [-255, 255] at index " << j + << " tx_type " << tx_type_; } memset(count_sign_block, 0, sizeof(count_sign_block)); - for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-15, 15]. for (int j = 0; j < 16; ++j) test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); - // TODO(Yaowu): this should be converted to a parameterized test - // to test optimized versions of this function. - vp9_short_fdct4x4_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) @@ -83,20 +127,22 @@ TEST(Vp9Fdct4x4Test, SignBiasCheck) { const bool bias_acceptable = (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000); EXPECT_TRUE(bias_acceptable) - << "Error: 4x4 FDCT has a sign bias > 10%" + << "Error: 4x4 FDCT/FHT has a sign bias > 10%" << " for input range [-15, 15] at index " << j; } -}; +} -TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { +TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); + int max_error = 0; double total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[16]; - int16_t test_temp_block[16]; - uint8_t dst[16], src[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16); for (int j = 0; j < 16; ++j) { src[j] = rnd.Rand8(); @@ -106,10 +152,8 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { for (int j = 0; j < 16; ++j) test_input_block[j] = src[j] - dst[j]; - // TODO(Yaowu): this should be converted to a parameterized test - // to test optimized versions of this function. const int pitch = 8; - vp9_short_fdct4x4_c(test_input_block, test_temp_block, pitch); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 16; ++j) { if(test_temp_block[j] > 0) { @@ -123,8 +167,8 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { } } - // Because the bitstream is not frozen yet, use the idct in the codebase. 
- vp9_short_idct4x4_add_c(test_temp_block, dst, 4); + // inverse transform and reconstruct the pixel block + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 16; ++j) { const int diff = dst[j] - src[j]; @@ -135,10 +179,12 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { } } EXPECT_GE(1, max_error) - << "Error: FDCT/IDCT has an individual roundtrip error > 1"; + << "Error: FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1"; EXPECT_GE(count_test_block, total_error) - << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; -}; + << "Error: FDCT/IDCT or FHT/IHT has average " + << "roundtrip error > 1 per block"; +} +INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4)); } // namespace diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc index 90b4ecd..50e2e9d 100644 --- a/libvpx/test/fdct8x8_test.cc +++ b/libvpx/test/fdct8x8_test.cc @@ -13,6 +13,7 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx_ports/mem.h" extern "C" { #include "vp9_rtcd.h" @@ -25,11 +26,62 @@ void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch); using libvpx_test::ACMRandom; namespace { +void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int /*tx_type*/) { + vp9_short_fdct8x8_c(in, out, stride); +} +void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int /*tx_type*/) { + vp9_short_idct8x8_add_c(out, dst, stride >> 1); +} +void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int tx_type) { + // TODO(jingning): need to refactor this to test both _c and _sse2 functions, + // when we have all inverse dct functions done sse2. +#if HAVE_SSE2 + vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type); +#else + vp9_short_fht8x8_c(in, out, stride >> 1, tx_type); +#endif +} +void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type); +} + +class FwdTrans8x8Test : public ::testing::TestWithParam<int> { + public: + virtual ~FwdTrans8x8Test() {} + virtual void SetUp() { + tx_type_ = GetParam(); + if (tx_type_ == 0) { + fwd_txfm = fdct8x8; + inv_txfm = idct8x8_add; + } else { + fwd_txfm = fht8x8; + inv_txfm = iht8x8_add; + } + } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*fwd_txfm)(in, out, dst, stride, tx_type); + } + void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*inv_txfm)(in, out, dst, stride, tx_type); + } + + int tx_type_; + void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int); + void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int); +}; -TEST(VP9Fdct8x8Test, SignBiasCheck) { +TEST_P(FwdTrans8x8Test, SignBiasCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - int16_t test_input_block[64]; - int16_t test_output_block[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64); const int pitch = 16; int count_sign_block[64][2]; const int count_test_block = 100000; @@ -41,7 +93,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { for (int j = 0; j < 64; ++j) test_input_block[j] = rnd.Rand8() - rnd.Rand8(); - vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -55,7 +107,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { const 
int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = 1125; EXPECT_LT(diff, max_diff) - << "Error: 8x8 FDCT has a sign bias > " + << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j << " count0: " << count_sign_block[j][0] @@ -70,7 +122,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { for (int j = 0; j < 64; ++j) test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); - vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -84,24 +136,25 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = 10000; EXPECT_LT(diff, max_diff) - << "Error: 4x4 FDCT has a sign bias > " + << "Error: 4x4 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j << " count0: " << count_sign_block[j][0] << " count1: " << count_sign_block[j][1] << " diff: " << diff; } -}; +} -TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { +TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; double total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[64]; - int16_t test_temp_block[64]; - uint8_t dst[64], src[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64); for (int j = 0; j < 64; ++j) { src[j] = rnd.Rand8(); @@ -112,7 +165,7 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 64; ++j){ if(test_temp_block[j] > 0) { test_temp_block[j] += 2; @@ -124,7 +177,7 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { test_temp_block[j] *= 4; } } - vp9_short_idct8x8_add_c(test_temp_block, dst, 8); + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; @@ -136,21 +189,23 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { } EXPECT_GE(1, max_error) - << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1"; + << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1"; EXPECT_GE(count_test_block/5, total_error) - << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block"; -}; + << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " + "error > 1/5 per block"; +} -TEST(VP9Fdct8x8Test, ExtremalCheck) { +TEST_P(FwdTrans8x8Test, ExtremalCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; double total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[64]; - int16_t test_temp_block[64]; - uint8_t dst[64], src[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64); for (int j = 0; j < 64; ++j) { src[j] = rnd.Rand8() % 2 ? 
255 : 0; @@ -161,8 +216,8 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch); - vp9_short_idct8x8_add_c(test_temp_block, dst, 8); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; @@ -173,13 +228,14 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) { } EXPECT_GE(1, max_error) - << "Error: Extremal 8x8 FDCT/IDCT has an" + << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an" << " individual roundtrip error > 1"; EXPECT_GE(count_test_block/5, total_error) - << "Error: Extremal 8x8 FDCT/IDCT has average" + << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; } -}; +} +INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4)); } // namespace diff --git a/libvpx/test/i420_video_source.h b/libvpx/test/i420_video_source.h index 12a6ab1..bcbe8a7 100644 --- a/libvpx/test/i420_video_source.h +++ b/libvpx/test/i420_video_source.h @@ -49,7 +49,7 @@ class I420VideoSource : public VideoSource { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, raw_sz_ * start_, SEEK_SET); @@ -92,6 +92,7 @@ class I420VideoSource : public VideoSource { } virtual void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); // Read a frame from input_file. if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) { limit_ = frame_; @@ -108,8 +109,8 @@ class I420VideoSource : public VideoSource { unsigned int frame_; unsigned int width_; unsigned int height_; - unsigned int framerate_numerator_; - unsigned int framerate_denominator_; + int framerate_numerator_; + int framerate_denominator_; }; } // namespace libvpx_test diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc index 659cce0..aa786cb 100644 --- a/libvpx/test/idct_test.cc +++ b/libvpx/test/idct_test.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - extern "C" { #include "./vpx_config.h" #include "./vp8_rtcd.h" @@ -22,100 +21,94 @@ typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr, int dst_stride); namespace { class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { - protected: - virtual void SetUp() { - int i; - - UUT = GetParam(); - memset(input, 0, sizeof(input)); - /* Set up guard blocks */ - for (i = 0; i < 256; i++) - output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1; - } - - virtual void TearDown() { - libvpx_test::ClearSystemState(); - } - - idct_fn_t UUT; - short input[16]; - unsigned char output[256]; - unsigned char predict[256]; + protected: + virtual void SetUp() { + int i; + + UUT = GetParam(); + memset(input, 0, sizeof(input)); + /* Set up guard blocks */ + for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 
0 : -1; + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + idct_fn_t UUT; + short input[16]; + unsigned char output[256]; + unsigned char predict[256]; }; TEST_P(IDCTTest, TestGuardBlocks) { - int i; + int i; - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << i; - else - EXPECT_EQ(255, output[i]); + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(0, output[i]) << i; + else + EXPECT_EQ(255, output[i]); } TEST_P(IDCTTest, TestAllZeros) { - int i; + int i; - REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(0, output[i]) << "i==" << i; + else + EXPECT_EQ(255, output[i]) << "i==" << i; } TEST_P(IDCTTest, TestAllOnes) { - int i; + int i; - input[0] = 4; - REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + input[0] = 4; + REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(1, output[i]) << "i==" << i; + else + EXPECT_EQ(255, output[i]) << "i==" << i; } TEST_P(IDCTTest, TestAddOne) { - int i; + int i; - for (i = 0; i < 256; i++) - predict[i] = i; - input[0] = 4; - REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); + for (i = 0; i < 256; i++) predict[i] = i; + input[0] = 4; + REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(i+1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(i + 1, output[i]) << "i==" << i; + else + EXPECT_EQ(255, output[i]) << "i==" << i; } TEST_P(IDCTTest, TestWithData) { - int i; - - for (i = 0; i < 16; i++) - input[i] = i; - - REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - - for (i = 0; i < 256; i++) - if ((i & 0xF) > 3 || i > 63) - EXPECT_EQ(255, output[i]) << "i==" << i; - else if (i == 0) - EXPECT_EQ(11, output[i]) << "i==" << i; - else if (i == 34) - EXPECT_EQ(1, output[i]) << "i==" << i; - else if (i == 2 || i == 17 || i == 32) - EXPECT_EQ(3, output[i]) << "i==" << i; - else - EXPECT_EQ(0, output[i]) << "i==" << i; + int i; + + for (i = 0; i < 16; i++) input[i] = i; + + REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + + for (i = 0; i < 256; i++) + if ((i & 0xF) > 3 || i > 63) + EXPECT_EQ(255, output[i]) << "i==" << i; + else if (i == 0) + EXPECT_EQ(11, output[i]) << "i==" << i; + else if (i == 34) + EXPECT_EQ(1, output[i]) << "i==" << i; + else if (i == 2 || i == 17 || i == 32) + EXPECT_EQ(3, output[i]) << "i==" << i; + else + EXPECT_EQ(0, output[i]) << "i==" << i; } -INSTANTIATE_TEST_CASE_P(C, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_c)); +INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); #if HAVE_MMX INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmx)); diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc index 39ec896..da96741 100644 --- a/libvpx/test/intrapred_test.cc +++ b/libvpx/test/intrapred_test.cc @@ -27,6 +27,8 @@ using libvpx_test::ACMRandom; class IntraPredBase { 
public: + virtual ~IntraPredBase() {} + virtual void TearDown() { libvpx_test::ClearSystemState(); } diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h index 48c3a7d..926f801 100644 --- a/libvpx/test/ivf_video_source.h +++ b/libvpx/test/ivf_video_source.h @@ -47,12 +47,13 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Init() { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed"; + ASSERT_TRUE(compressed_frame_buf_ != NULL) + << "Allocate frame buffer failed"; } virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " << file_name_; // Read file header @@ -72,6 +73,7 @@ class IVFVideoSource : public CompressedVideoSource { } void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc index 85ca0b9..f7572e8 100644 --- a/libvpx/test/keyframe_test.cc +++ b/libvpx/test/keyframe_test.cc @@ -31,10 +31,6 @@ class KeyframeTest : public ::libvpx_test::EncoderTest, set_cpu_used_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (kf_do_force_kf_) diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index 0d591ad..7412a24 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -70,10 +70,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest, SetMode(GET_PARAM(1)); } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void DecompressedFrameHook(const vpx_image_t &img, vpx_codec_pts_t pts) { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index 1f5435f..bf3e0b8 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -452,10 +452,14 @@ const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; #endif #if CONFIG_VP9_ENCODER const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; +const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; +const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; +const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2; +const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2; const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; -const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; +const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; #endif @@ -469,10 +473,14 @@ const sad_m_by_n_test_param_t sse2_tests[] = { #endif #if CONFIG_VP9_ENCODER make_tuple(64, 64, sad_64x64_sse2_vp9), + make_tuple(64, 32, sad_64x32_sse2_vp9), + make_tuple(32, 64, sad_32x64_sse2_vp9), make_tuple(32, 32, sad_32x32_sse2_vp9), + make_tuple(32, 16, sad_32x16_sse2_vp9), + make_tuple(16, 32, sad_16x32_sse2_vp9), make_tuple(16, 16, sad_16x16_sse2_vp9), - make_tuple(8, 16, 
sad_8x16_sse2_vp9), make_tuple(16, 8, sad_16x8_sse2_vp9), + make_tuple(8, 16, sad_8x16_sse2_vp9), make_tuple(8, 8, sad_8x8_sse2_vp9), make_tuple(8, 4, sad_8x4_sse2_vp9), #endif diff --git a/libvpx/test/subtract_test.cc b/libvpx/test/subtract_test.cc index 81bfb66..574bfbf 100644 --- a/libvpx/test/subtract_test.cc +++ b/libvpx/test/subtract_test.cc @@ -61,7 +61,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) { int16_t *src_diff = be.src_diff; for (int r = 0; r < kBlockHeight; ++r) { for (int c = 0; c < kBlockWidth; ++c) { - src_diff[c] = 0xa5a5; + src_diff[c] = static_cast<int16_t>(0xa5a5); } src_diff += kDiffPredStride; } diff --git a/libvpx/test/superframe_test.cc b/libvpx/test/superframe_test.cc index 062ec6c..d91e7b1 100644 --- a/libvpx/test/superframe_test.cc +++ b/libvpx/test/superframe_test.cc @@ -33,10 +33,6 @@ class SuperframeTest : public ::libvpx_test::EncoderTest, delete[] modified_buf_; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { if (video->frame() == 1) { diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1 index 1036d7c..0ac4905 100644 --- a/libvpx/test/test-data.sha1 +++ b/libvpx/test/test-data.sha1 @@ -122,223 +122,401 @@ f95eb6214571434f1f73ab7833b9ccdf47588020 vp80-03-segmentation-1437.ivf.md5 086c56378df81b6cee264d7540a7b8f2b405c7a4 vp80-05-sharpness-1439.ivf.md5 d32dc2c4165eb266ea4c23c14a45459b363def32 vp80-05-sharpness-1440.ivf.md5 8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5 -c5b6fc822d7b4ed97b5a0d69e3a71d9de6cab815 vp90-00-akiyo-100.webm -1cd8ee73b53f4ecc2511effd233f9af6ecdfac7e vp90-00-akiyo-100.webm.md5 -a854b0f2313efde7767a4465afbcbe35005ffb07 vp90-00-akiyo-200.webm -b0f53ad309611246821174b642f6808cc1e670de vp90-00-akiyo-200.webm.md5 -38a5c0e5465f884474b1a5a9184685f17f961ba1 vp90-00-akiyo-300.webm -756a34417fc10dc2a49464eccaa6b7f987227b57 vp90-00-akiyo-300.webm.md5 -1047e6f19dd137ae7bbd5b93d407fc7186f8a98e vp90-00-akiyo-50.webm -0fa08a76901a6a5b2d4b58a6b20bfa5239409b9d vp90-00-akiyo-50.webm.md5 -767511b25dde2c5926f5284782a9f1e04fe7afda vp90-00-bowing-150.webm -b259c3c6afb30fd1ae7d3a563c1fe9fe6a4644cd vp90-00-bowing-150.webm.md5 -2ef831c75c021a03176536fb652196e9afc37888 vp90-00-bowing-25.webm -37d3522cd76b7bab3b5e973e2b2c51edea49ef3f vp90-00-bowing-25.webm.md5 -c1e4639f14914516ca704f38c875d01f4c06be14 vp90-00-bowing-400.webm -ca35c574512185d5f20f3b81517d6ac3333a1377 vp90-00-bowing-400.webm.md5 -e20fc293db095e52f29b891bc09458e7568e8603 vp90-00-bus-100.webm -a754ea588cc409546936c09fb1ad06b3014b94f9 vp90-00-bus-100.webm.md5 -da5eb45fa42f55ff70ec7b71999e6fd8489d12f9 vp90-00-bus-2000.webm -2a7356328eb991175cbddebd51a30018e48632f2 vp90-00-bus-2000.webm.md5 -607169c774664176aca7c7d46dabf04b9c3634e4 vp90-00-bus-300.webm -c84daa3a0290d73226b243dd630820ac97bf4fbd vp90-00-bus-300.webm.md5 -655902b54b9a8a882c11bc8bce1447f3b2085035 vp90-00-bus-4400.webm -f719ecd7b53c8e35fae735396629d1915ffc1ff9 vp90-00-bus-4400.webm.md5 -afcdca9763d233dd63fd67165a7b92ea679822af vp90-00-bus-800.webm -66e2a55560e570cae09520060f1ae315c7ea0a07 vp90-00-bus-800.webm.md5 -390b91c8566d94c3a869af77531585c38f9f78da vp90-00-cheer-1600.webm -3d47da26375a75afef0cf2123f5c808d0862e25d vp90-00-cheer-1600.webm.md5 -23419784db17a50e129e3bd030c20256cf0d6eb0 vp90-00-cheer-2800.webm -0df4676171f19e7807d719a9b8a6fadcefc8f1fc vp90-00-cheer-2800.webm.md5 -45ed3c42874d5ec88852798691cf54bfb0cf652a vp90-00-cheer-400.webm 
-374fd67ac9ae0e8146051b77963459c54b9eaaa2 vp90-00-cheer-400.webm.md5 -1c9459d824116a297ff0e90bed9be783005f9ac1 vp90-00-cheer-600.webm -9dc0d43f72c8eb49d51a9748fb9948495529a6b5 vp90-00-cheer-600.webm.md5 -a86c5af1929d2f929a5caf6ef847d0066086223b vp90-00-city-1200.webm -231c7f0f406e3a8d2328daee4c4466e1b4d47354 vp90-00-city-1200.webm.md5 -be9cf927e6ab517d7876925d21b3193b1373d03d vp90-00-city-2000.webm -487d60226a3a3039528a049e9c6e8243b07404e6 vp90-00-city-2000.webm.md5 -1f3cd649d5829d52c08da3323baa86b1dcf2d2de vp90-00-city-300.webm -8e3b38cfa2be757e46ea12cff11762cb50134615 vp90-00-city-300.webm.md5 -286f6ea64c33ce735b5b7806aac4ca5ee331af66 vp90-00-city-600.webm -7c51ead147ef4029094a2b455239090c1999d8fe vp90-00-city-600.webm.md5 -f7ecbd63bed06ed15afe0ba2a192f2cf7943714c vp90-00-coastguard-1200.webm -8c8fed2c64cc8fb330e9200e1e0f58a79b953b79 vp90-00-coastguard-1200.webm.md5 -2e63178e5b2c2cc84226df2b514c4dde46c32d70 vp90-00-coastguard-200.webm -128f2b22fdcfd02bc50e63b1cd6d40c0cc4998d6 vp90-00-coastguard-200.webm.md5 -97b779617d3c1ca8f50beda7126be5df913d071d vp90-00-coastguard-3600.webm -0da0ab4794439e6b8ab9ced41239e1307686be69 vp90-00-coastguard-3600.webm.md5 -5e060d66573a40f7f0a46ae9b6acb51b0afb2e3c vp90-00-coastguard-5200.webm -4ba526d4bb895c4794dc20edeb38b102a9b1bd92 vp90-00-coastguard-5200.webm.md5 -17810fa737f29d5b032836e38243bbb666f06636 vp90-00-container-1000.webm -7e0fd7e93c5a16394818f844aa5f2d5fa7a73ee2 vp90-00-container-1000.webm.md5 -38deb4f59cec9e62715dec2f3670ffe7b1cf493e vp90-00-container-200.webm -aa3229017f920750bd5d919e19ea6127ea05adc0 vp90-00-container-200.webm.md5 -8b1a67ef35d3f00981d23c41b56a0a2e09976312 vp90-00-container-50.webm -0a6f1a793b936ff1287326882f1165065a2dcea0 vp90-00-container-50.webm.md5 -4c724db691b7202b60b56107ec7b0abc6cc52bdc vp90-00-deadline-1000.webm -5903bd89be457be681a6c6c8fd8c19f4570173db vp90-00-deadline-1000.webm.md5 -ee5e19a8fe14d3e72b1314a012b49a3bc0586375 vp90-00-deadline-200.webm -77095f98406fa27a2da8661f21664c00292dcefc vp90-00-deadline-200.webm.md5 -8230b07aa0ee7adf3caabae4e3bef997929001eb vp90-00-deadline-50.webm -fc47a159b2d2b0bed93d4e2c35408243e70b6d24 vp90-00-deadline-50.webm.md5 -244d12cda51235dcc421fedbe12422b326f539e7 vp90-00-flower-100.webm -dfeca236450b5ff19c1558ad33fba7ab7ff75f27 vp90-00-flower-100.webm.md5 -d5b7057564f670f7bf82017e2abc3aed5656b810 vp90-00-flower-2000.webm -65118811f4d46ef1e911d520296731536d3a507e vp90-00-flower-2000.webm.md5 -a9c226643365f0c8ae03e780d55aa6c6fa9cc0e7 vp90-00-flower-300.webm -fa5193d1a6e6b9e8bb91f75e91a3a377f00fa42e vp90-00-flower-300.webm.md5 -b206284b51dec6219c46e9b03def38a94d91bf89 vp90-00-flower-4400.webm -c8a73acd8234b287e86465d03fbf4f886d1fefb2 vp90-00-flower-4400.webm.md5 -faff83d7b6aa89f5d9518ffc5d4b145eb02b6800 vp90-00-flower-800.webm -328dd1969804afc094d010f54f350bd05390d6a9 vp90-00-flower-800.webm.md5 -42caa40d3b76b8ae5e7573b95e09bc4e57bea835 vp90-00-football-1600.webm -167b8f58a85d83050d4c56391d6b2d9a9a205b9a vp90-00-football-1600.webm.md5 -4c4f93f594a8ef89a9ba903bbcff914022a5ad9d vp90-00-football-2800.webm -7995f7f91b13d4ab5badcd3f9282bd1fceba38f3 vp90-00-football-2800.webm.md5 -c3ff724e79b4ae0202929f3ed1a1a5b67d10901f vp90-00-football-400.webm -19164a0e58ca5d407282a867866e8ec4a0a08fea vp90-00-football-400.webm.md5 -95de1c4abceab3706f0225e3b9c5dc719901a6cf vp90-00-football-600.webm -4a4454ae4d65748a45eaa3decb783bbe0ba190dc vp90-00-football-600.webm.md5 -80eebcdae76459c00d14b6c50f7529377e53a1c2 vp90-00-foreman-1200.webm -8228cc5a7cc83970b3a65f9b49bc74733255b09c vp90-00-foreman-1200.webm.md5 
-601d0ff4f058a3da3af4409e4117795f7c231fda vp90-00-foreman-2000.webm -e0c0b0aa6f9597984a2d78e799a00e0052710b2c vp90-00-foreman-2000.webm.md5 -30ebc327645d68bcc83eab72610bba22f877fb4c vp90-00-foreman-300.webm -080fc2adf29a84f02a3e4b5508fc2f8dc32f1440 vp90-00-foreman-300.webm.md5 -6b1a6be0f7bd7605b565750b3080be397d4c48a0 vp90-00-foreman-600.webm -f7713d3eba8d34d511ba1c9585a5a3f34e133ba5 vp90-00-foreman-600.webm.md5 -b080d9786abc89b4be59bffc5baba7b42fbc286a vp90-00-hallmonitor-1200.webm -77be47800b58001eb7a854d4d4a9b9823bbbe158 vp90-00-hallmonitor-1200.webm.md5 -05cd8e8d58ab8311ad528c27b4c89cdf268e749b vp90-00-hallmonitor-2000.webm -de1aa35c7172e78e07d6b197280214bbd362cc4e vp90-00-hallmonitor-2000.webm.md5 -908676b32b190e956518bb742d1415efceeb8c75 vp90-00-hallmonitor-300.webm -f9d39866db341d18256339e9fd2c0ec296f47702 vp90-00-hallmonitor-300.webm.md5 -1307c7f7558de34a6230912e684ff9571a05db5f vp90-00-hallmonitor-600.webm -954b292dd56be5c1bf153df440b132e1b1fbcb68 vp90-00-hallmonitor-600.webm.md5 -05f556288c5c4211420f7c332daded816f9b31b7 vp90-00-harbour-1200.webm -399481f93cc252f20ad5141dd402cf5363673578 vp90-00-harbour-1200.webm.md5 -fa62e449485c544c281030c5ccff32c60d4dd169 vp90-00-harbour-200.webm -3d0e1885befb2493c477384917797164d4fe58e4 vp90-00-harbour-200.webm.md5 -fa3a5e563c3d2215703c1a68f71fbe2168a42468 vp90-00-harbour-3600.webm -9af392f6b2cb5ec5c9446b7262206773df535319 vp90-00-harbour-3600.webm.md5 -476db4b15989a5a078f1d2fc5f9734d1d24f1da1 vp90-00-harbour-5200.webm -352a05b179dc1f86cf6ce27494a4a8fb42379d72 vp90-00-harbour-5200.webm.md5 -0ea17a4892383a2fd0be9f88f213f5f48f2a61f4 vp90-00-highway-100.webm -a2fe942955bafa83295d1381c9a25264764924c5 vp90-00-highway-100.webm.md5 -7ab80485670a5343a74c4a2454761ed3bed7ceef vp90-00-highway-1600.webm -fda9c82cb5d28a5ff5f7dae7c537e9187dfbd4cc vp90-00-highway-1600.webm.md5 -162d42e033dad04fd7ae3bf9d39e9e204c022edc vp90-00-highway-2800.webm -b882c93a2dc89feb6090b0f72e67ac8a59fc0986 vp90-00-highway-2800.webm.md5 -79b9a0e6fa6cdd2367228e9ac8d6a369a8d647e6 vp90-00-highway-50.webm -80ecf926372dbe8c1b4bcd68ea2101f78a93b02e vp90-00-highway-50.webm.md5 -a67fd02cbb75c1a757b5ea56b9eee46069bfadbf vp90-00-husky-100.webm -12cd583e791c8e5b40b5dffe4a9dbcc1929dc645 vp90-00-husky-100.webm.md5 -1a8b4302eb6f88b14a9acd4a6cbe62d0b380f2e4 vp90-00-husky-2000.webm -a9c2532e5d867d7627bb6767008b43b653cce904 vp90-00-husky-2000.webm.md5 -f56f66afd4d4512a49904275a1c942ba7379fec4 vp90-00-husky-300.webm -196dc386f104b7b9ed2ec6c6a1f104ce0319c2eb vp90-00-husky-300.webm.md5 -6ba3c16fd98d37a8de7023419682a3595778b9bc vp90-00-husky-4400.webm -2f4815ba97e352fcd0089d1a5883a0aff1e5394a vp90-00-husky-4400.webm.md5 -db04a296c377693dd6e974bea36256f4b14cddef vp90-00-husky-800.webm -7658473ad17ee689a37fda558c5a23816131cfc3 vp90-00-husky-800.webm.md5 -50cf9e34b61e1cf32c9dde2ebcc5f5703c379a41 vp90-00-ice-150.webm -806ceba91dc40c45eafc4d7ee61df9346c6fe5f9 vp90-00-ice-150.webm.md5 -4cfca1bea7aae6e4405abfca603cfbded13ded1a vp90-00-ice-400.webm -e4298abf05419973da89c0bfcdf0006b1606ebcd vp90-00-ice-400.webm.md5 -12e3ccfdf96c3f4eebeed8106c5daef6c2b28d83 vp90-00-ice-800.webm -6fb2aacb4d8131dcabaa61a9cd2497cd09854377 vp90-00-ice-800.webm.md5 -124977938c47ba739e918533bc5d6d73e41ce2ec vp90-00-mobile-1600.webm -603b2b523c8ed5922121d285567a845bb6693d35 vp90-00-mobile-1600.webm.md5 -93f204b90250791b884479be5da534a5bc6304ff vp90-00-mobile-2800.webm -21ec8735b774c66e192f7270c12075f598f700d5 vp90-00-mobile-2800.webm.md5 -fe9cdbfdeee2b7554efb532f646703cff55c2d2c vp90-00-mobile-400.webm 
-4def63c78ee09e90e6385d3122ada95343246102 vp90-00-mobile-400.webm.md5 -2a042aa8a06c45770dcb52c56a7f5cea6d51b8dd vp90-00-mobile-600.webm -03169f031dece0db3d89ce16cc3e0ee3eca21065 vp90-00-mobile-600.webm.md5 -7fc5b0b0c684d63e161c9c5932e1374327e15dd4 vp90-00-motherdaughter-100.webm -290ac7722caf4b15136b307a239c9b903113b9c4 vp90-00-motherdaughter-100.webm.md5 -67ddfce82bff083a1ceb108a7dcfb801791102f1 vp90-00-motherdaughter-300.webm -7696698d38e32f0afeb3a3e9a45b7fe3f237aaba vp90-00-motherdaughter-300.webm.md5 -ff65a1bee2fe384728017c5148df61379043d5b6 vp90-00-motherdaughter-600.webm -f0b167000bf40877d1ba7ba52a08b4310011c032 vp90-00-motherdaughter-600.webm.md5 -d73c54e676bd63424fc9ad8d0cef64e929081cf4 vp90-00-news-100.webm -71821b71a97823e9ba58563efc841dc6beefe9df vp90-00-news-100.webm.md5 -2937238d094863951eb8f218438b966d2b7b5430 vp90-00-news-300.webm -2587d0859a330cf6d8e0a135d1f586bb2a5033fc vp90-00-news-300.webm.md5 -65afdd4fc411951115b48435b8b65155594b5c99 vp90-00-news-600.webm -5815bb341db976f44dab97bb9cfba8ea0ca55502 vp90-00-news-600.webm.md5 -de5dd99ac04d3a937fc0951d06fb8f533fdc393a vp90-00-pamphlet-150.webm -0381d705fa490f35c772e3048b423b382088d546 vp90-00-pamphlet-150.webm.md5 -46f283284cb64b79243b2ea6aad709a526c26393 vp90-00-pamphlet-25.webm -f100fbebcad96f27ed8f340414b939bc738d49d0 vp90-00-pamphlet-25.webm.md5 -8df04ece12455c5c40f14cb089348260798c5f2b vp90-00-pamphlet-400.webm -66a2c87cd4194368d3477e9a334880b76c87e991 vp90-00-pamphlet-400.webm.md5 -a00e97e4a71f5e24f194c59cde7d41bc2c3af325 vp90-00-paris-1000.webm -53ef896e16d1b83aa5166945d149c7133401b3f0 vp90-00-paris-1000.webm.md5 -6b03388e0236f6171e20c73834858e3c87b441b2 vp90-00-paris-200.webm -55a324b0153c5d54cd0c0492fed8755c441fa18c vp90-00-paris-200.webm.md5 -429ec362a9600c8822652cf7e122e22bca033d69 vp90-00-paris-50.webm -4406226b7bddb11ede8ee0c442d52e5d3bbbde78 vp90-00-paris-50.webm.md5 -a7996d4e757ea484aa72e14f623d6c9e72537888 vp90-00-signirene-1000.webm -f65a1ac6e1ce77102e63fb363dbca361b8108c02 vp90-00-signirene-1000.webm.md5 -8c2f686179bc3e87a18b48bcb5058f3cd61e1b4c vp90-00-signirene-200.webm -b8ab16cba9392e49169c374eb1e0c1b763ccaefb vp90-00-signirene-200.webm.md5 -5f8f99c386dce64931bbd4fc42a59a78dc6fdba1 vp90-00-signirene-50.webm -fdb8c4bc302884d413a256634d3e2fbd92867c90 vp90-00-signirene-50.webm.md5 -d5074f0a5bcefe9fd651afbbebf0e0f3fedb965b vp90-00-silent-1000.webm -9c075894fbfb84791fcc7dbd3fcab15b0a9bd64e vp90-00-silent-1000.webm.md5 -32101f334f675715a8f411638dfda80afacc37a6 vp90-00-silent-200.webm -fb0dac37f31ca711443832046a6aaf868e69b357 vp90-00-silent-200.webm.md5 -0aaef50d7f94873e99ec7e39f59a6b74e92ad946 vp90-00-silent-50.webm -be9fc41965b5b63f7c7bbd6c91191e940903e012 vp90-00-silent-50.webm.md5 -5e22ad14c562733d4d4a3ce163b580ed4a64e6fe vp90-00-soccer-100.webm -1ca9a0016910cfca26def9944568749a168131d8 vp90-00-soccer-100.webm.md5 -2d9b2a0fa5ac210f8d7c646578698e045733ad4a vp90-00-soccer-2000.webm -f979078650057606ca770b3f03be4c509efb40a9 vp90-00-soccer-2000.webm.md5 -7b789360ffc1eb5a3735f8a1f8d248a24ca4267c vp90-00-soccer-300.webm -195d33b23ca8304519bd6e38e9657e53a04779d8 vp90-00-soccer-300.webm.md5 -3907318ef35573e4efc5c150d3aff271c7157501 vp90-00-soccer-4400.webm -4b43ceecae9a9a7d39a47347f9e20af3613827d1 vp90-00-soccer-4400.webm.md5 -c89920aa89194cb6a36f77dff8722573f0df7241 vp90-00-soccer-800.webm -1da71751009afa483a03e274a538df24c9f5e513 vp90-00-soccer-800.webm.md5 -efca14e8e0515a8f8ed3ded11fdbff24b09a7f9d vp90-00-stefan-1600.webm -6f103270ce03cc85b28dd1c86d0447922d810671 vp90-00-stefan-1600.webm.md5 
-b99ab6a983d48c15aa3a9160d06286fca0074193 vp90-00-stefan-2800.webm -986a72dd9988c6bf4246cd5bd966ce991ba55319 vp90-00-stefan-2800.webm.md5 -eb962244ca51a101ad8f585df6be8f5f96691f18 vp90-00-stefan-400.webm -2747cfd8f74aedc370767f08129b35ace70e1fe7 vp90-00-stefan-400.webm.md5 -b507b8cedd0147c5316db8f84f35ace768c25069 vp90-00-stefan-600.webm -daeb369046c2dc27ecfde978b87fd8b49d83789f vp90-00-stefan-600.webm.md5 -c5c2dd891c2b5fe4a70845858ccb859df3455ee7 vp90-00-students-100.webm -d1be06dc636ece0c34ab8c17399888aaf19e0c19 vp90-00-students-100.webm.md5 -c9e4da3a8b455aa690d89338f32f9d76773cdd18 vp90-00-students-300.webm -a9aa72e1ee27063f8e9f13b4647cec01c8efb2d6 vp90-00-students-300.webm.md5 -e9e5072cd944a8994e50fce367975e3ce526bd67 vp90-00-students-600.webm -86525ce188a98a51f86fad27341729bb61d1ca8b vp90-00-students-600.webm.md5 -58deb053aeafefdfdf13741accf9fcbe4584ea94 vp90-00-tempete-1200.webm -ec395a2ec76b4c1e64e243366a8840da22ee3a65 vp90-00-tempete-1200.webm.md5 -5d35232eaa8ee149a917ff94536968fb37dad50e vp90-00-tempete-200.webm -7f8c7529f40d6b6d6de8e89dbf9697623d27c234 vp90-00-tempete-200.webm.md5 -c44eb147bc3f8682b96096fccef8beb4380c40db vp90-00-tempete-3600.webm -01fd23e412530fa2d5319a22886161957a747ee0 vp90-00-tempete-3600.webm.md5 -56ab322b34a750e16dcc8ccfb735a5b9270cedc4 vp90-00-tempete-5200.webm -1cf803409ae53b991bff10079af4ab07aaa2853d vp90-00-tempete-5200.webm.md5 -ffe48d52019c228e919f4b123028664b8d0c2f4b vp90-00-tennis-100.webm -406fda3367899995d4e37170063495832e2be372 vp90-00-tennis-100.webm.md5 -6c030f8142b1932fbe8eb5c2b39b3452a5eea3aa vp90-00-tennis-2000.webm -dcf20921e2a8ab0dcd09f7f6bdcdd35f979205ae vp90-00-tennis-2000.webm.md5 -3fe0df7b74f301b39e1b21e6926c69a8418b9b70 vp90-00-tennis-300.webm -80c8301d3a37b33ca50318ba000066a6ae9929dc vp90-00-tennis-300.webm.md5 -82a2497083b8dce6b1c73bcdf16323ea69d1cca9 vp90-00-tennis-4400.webm -83ce97bc09a7e1b2f2c3437195a8931d7608a62b vp90-00-tennis-4400.webm.md5 -2c8bd3a29bbd1085169bfcba9fdf65a37f4a16bb vp90-00-tennis-800.webm -9920a65e06d2e7025f13f3d8bf35670503875aed vp90-00-tennis-800.webm.md5 -26469062c5724c2cc4914436ef032bb55373f843 vp90-00-waterfall-150.webm -9b86373ce15302a9b22cef8f808ce0e37e6d2b65 vp90-00-waterfall-150.webm.md5 -410ba6af2ddca5110fa7a4c383dc8b28f38cf565 vp90-00-waterfall-200.webm -251892d3fdcbc9d7a20c22ba202ed4935222e5b8 vp90-00-waterfall-200.webm.md5 -40b643aff88aed3764c5b58c446a8fbbc5fb36d7 vp90-00-waterfall-400.webm -51f31a6b6408f8af4d107e0f2a3c1a274d4da6bb vp90-00-waterfall-400.webm.md5 -bd421141e01f53dc15ced790f9a96ab70a613260 vp90-00-waterfall-800.webm -1366efe772fccaa2b8a6ac3ce45255b312a2ef6c vp90-00-waterfall-800.webm.md5 +ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 vp90-2-00-quantizer-00.webm +ac5eda33407d0521c7afca43a63fd305c0cd9d13 vp90-2-00-quantizer-00.webm.md5 +2ca0463f2cfb93d25d7dded174db70b7cb87cb48 vp90-2-00-quantizer-01.webm +10d98884fc6d9a5f47a2057922b8e25dd48d7786 vp90-2-00-quantizer-01.webm.md5 +d80a2920a5e0819d69dcba8fe260c01f820f8982 vp90-2-00-quantizer-02.webm +c964c8e5e04165fabbf1c6ee8ee5121d35921965 vp90-2-00-quantizer-02.webm.md5 +fdef046777b5b75c962b715d809dbe2ea331afb9 vp90-2-00-quantizer-03.webm +f270bee0b0c7aa2bf4c5afe098556b4f3f890faf vp90-2-00-quantizer-03.webm.md5 +66d98609e809394a6ac730787e6724e3badc075a vp90-2-00-quantizer-04.webm +427433bfe121c4aea1095ec3124fdc174d200e3a vp90-2-00-quantizer-04.webm.md5 +e6e42626d8cadf0b5be16313f69212981b96fee5 vp90-2-00-quantizer-05.webm +c98f6a9a1af4cfd71416792827304266aad4bd46 vp90-2-00-quantizer-05.webm.md5 +413ef09b721f5dcec1a96e937a97e5873c2e6db6 
vp90-2-00-quantizer-06.webm +5080e940a23805c82e578e21b57fc2c511e76376 vp90-2-00-quantizer-06.webm.md5 +4a50a5f4ac717c30dfaae8bb46702e3542e867de vp90-2-00-quantizer-07.webm +76c429a02b56762e10ee4db88729d8834b3a70f4 vp90-2-00-quantizer-07.webm.md5 +d2f4e464780bf8b7e647efa18ac777a930e62bc0 vp90-2-00-quantizer-08.webm +ab94aabf9316111b52d7c531962ed4123313b6ba vp90-2-00-quantizer-08.webm.md5 +174bc58433936dd79550398d744f1072ce7f5693 vp90-2-00-quantizer-09.webm +e1f7690cd83ccc56d045e17cce552544a5f03810 vp90-2-00-quantizer-09.webm.md5 +52bc1dfd3a97b24d922eb8a31d07527891561f2a vp90-2-00-quantizer-10.webm +9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8 vp90-2-00-quantizer-10.webm.md5 +10031eecafde1e1d8e6323fe2b2a1d7e77a66869 vp90-2-00-quantizer-11.webm +fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0 vp90-2-00-quantizer-11.webm.md5 +78e9f7bb77e8e348155bbdfa12790789d1d50c34 vp90-2-00-quantizer-12.webm +0961d060cc8dd469c6dac8d7d75f927c0bb971b8 vp90-2-00-quantizer-12.webm.md5 +133b77a3bbcef652552d74ffc46afbfe3b8a1cba vp90-2-00-quantizer-13.webm +df29e5e0f95772af482f540d776f6b9dea4bfa29 vp90-2-00-quantizer-13.webm.md5 +27323afdaf8987e025c27129c74c86502315a206 vp90-2-00-quantizer-14.webm +ce96a2cc312942f0427a463f15a392870dd69764 vp90-2-00-quantizer-14.webm.md5 +ab58d0b41037829f6bc993910999f4af0212aafd vp90-2-00-quantizer-15.webm +40f700db606501aa7cb49049624cbdde6409b122 vp90-2-00-quantizer-15.webm.md5 +cd948e66448aafb65998815ce37241f95d7c9ee7 vp90-2-00-quantizer-16.webm +039b742d149c945ed79c7b9a6384352852a1c116 vp90-2-00-quantizer-16.webm.md5 +62f56e663e13c576764e491cf08f19bd46a71999 vp90-2-00-quantizer-17.webm +90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e vp90-2-00-quantizer-17.webm.md5 +f26ecad7263cd66a614e53ba5d7c00df181affeb vp90-2-00-quantizer-18.webm +cda0a1c0fca2ec2976ae55124a8a67305508bae6 vp90-2-00-quantizer-18.webm.md5 +94bfc4c04fcfe139a63b98c569e8c14ba98c401f vp90-2-00-quantizer-19.webm +5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f vp90-2-00-quantizer-19.webm.md5 +0ee88e9318985e1e245de78c2c4a665885ab76a7 vp90-2-00-quantizer-20.webm +4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063 vp90-2-00-quantizer-20.webm.md5 +6a995cb2b1db33da8087321df1e646f95c3e32d1 vp90-2-00-quantizer-21.webm +e216b4a1eceac03efcc433759be54ab8ea87b24b vp90-2-00-quantizer-21.webm.md5 +aa7722fc427e7180115f3c9cd96bb6b2768e7296 vp90-2-00-quantizer-22.webm +1aa813bd45ae831bf5e79ace4d73dfd25989a07d vp90-2-00-quantizer-22.webm.md5 +7677e5b929ed6d142041f19b8a9cd5822ee1504a vp90-2-00-quantizer-23.webm +0de0af34abd843d5b37e58baf3ed96a6104b64c3 vp90-2-00-quantizer-23.webm.md5 +b2995cbe1128b2d4926f1b28d01c501ecb6be8c8 vp90-2-00-quantizer-24.webm +db6033af2ba2f2bca62468fb4b8808e474f93923 vp90-2-00-quantizer-24.webm.md5 +8135ba35587fd92cd4667be7896323d9b634401c vp90-2-00-quantizer-25.webm +3499e00c2cc15876f61f07e3d3cfca54ebcd98fd vp90-2-00-quantizer-25.webm.md5 +af0fa2907746db82d345f6d831fcc1b2862a29fb vp90-2-00-quantizer-26.webm +cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7 vp90-2-00-quantizer-26.webm.md5 +bd0002e91323776beb5ff11e06edcf19fc08e9b9 vp90-2-00-quantizer-27.webm +fe72154ef196067d6c272521012dd79706496cac vp90-2-00-quantizer-27.webm.md5 +fc15eb606f81455ff03df16bf3432296b002c43c vp90-2-00-quantizer-28.webm +40b2e24b542206a6bfd746ef199e49ccea07678a vp90-2-00-quantizer-28.webm.md5 +3090bbf913cad0b2eddca7228f5ed51a58378b8d vp90-2-00-quantizer-29.webm +eb59745e0912d8ed6c928268bcf265237c9ba93f vp90-2-00-quantizer-29.webm.md5 +c615abdca9c25e1cb110d908edbedfb3b7c92b91 vp90-2-00-quantizer-30.webm +ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0 
vp90-2-00-quantizer-30.webm.md5 +037d9f242086cfb085518f6416259defa82d5fc2 vp90-2-00-quantizer-31.webm +4654b40792572f0a790874c6347ef9196d86c1a7 vp90-2-00-quantizer-31.webm.md5 +505899f3f3515044c5c8b3213d9b9d16f614619d vp90-2-00-quantizer-32.webm +659a2e6dd02df323f62600626859006640b445df vp90-2-00-quantizer-32.webm.md5 +8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc vp90-2-00-quantizer-33.webm +5b175ef1120ddeba4feae1247bf381bbc4e816ce vp90-2-00-quantizer-33.webm.md5 +4d283755d17e287b1d099a80604398f60d7fb6ea vp90-2-00-quantizer-34.webm +22a739de95acfeb27524e3700b8f678a9ad744d8 vp90-2-00-quantizer-34.webm.md5 +4296f56a892a412d3d4f64824718dd566c4e6459 vp90-2-00-quantizer-35.webm +c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7 vp90-2-00-quantizer-35.webm.md5 +6f54e11da461e4410dd9075b015e2d9bc1d07dfb vp90-2-00-quantizer-36.webm +0b3573f5addea4e3eb11a0b85f068299d5bdad78 vp90-2-00-quantizer-36.webm.md5 +210581682a26c2c4375efc785c36e07539888bc2 vp90-2-00-quantizer-37.webm +2b4fb6f8ba975237858e61cc8f560bcfc87cb38e vp90-2-00-quantizer-37.webm.md5 +a15ef31283dfc4860f837fe200eb32a445f59629 vp90-2-00-quantizer-38.webm +fb76771f3a795054b9936f70da7505c3ac585284 vp90-2-00-quantizer-38.webm.md5 +1df8433a441412831daae6726df89fa70d21b14d vp90-2-00-quantizer-39.webm +39e162c09a20e7e684868097766347014371fee6 vp90-2-00-quantizer-39.webm.md5 +5330e4788ab9129dbb25a7a7d5411104521248b6 vp90-2-00-quantizer-40.webm +872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1 vp90-2-00-quantizer-40.webm.md5 +d88d03b982889e399a78d7a06eeb1cf30e6c2da2 vp90-2-00-quantizer-41.webm +5b4f7217e57fa2a221011d0b32f8d0409496b7b6 vp90-2-00-quantizer-41.webm.md5 +9e16406e3e26955a6e17d455ef1ef64bbfa26e53 vp90-2-00-quantizer-42.webm +0219d090cf37daabe19256ba8e932ba4874b92e4 vp90-2-00-quantizer-42.webm.md5 +a9b15843486fb05f8cd15437ef279782a42b75db vp90-2-00-quantizer-43.webm +3c9b0b4c607f9579a31726bfcf56729334ddc686 vp90-2-00-quantizer-43.webm.md5 +1dbc931ac446c91eabe7213efff55b596cccf07c vp90-2-00-quantizer-44.webm +73bc8f675103abaef3d9f73a2742b3bffd726d23 vp90-2-00-quantizer-44.webm.md5 +7c6c1be15beb9d6201204b018966c8c4f9777efc vp90-2-00-quantizer-45.webm +c907b29da821f790c6748de61f592689312e4e36 vp90-2-00-quantizer-45.webm.md5 +07b434da1a467580f73b32177ee11b3e00f65a0d vp90-2-00-quantizer-46.webm +7b2b7ce60c50bc970bc0ada46d7a7ce440148da3 vp90-2-00-quantizer-46.webm.md5 +233d0465fb1a6fa36e9f89bd2193ac79bd4d2809 vp90-2-00-quantizer-47.webm +527e0a9fb932efe915027ffe077f9e8d3a4fb139 vp90-2-00-quantizer-47.webm.md5 +719613df7307e205c3fdb6acfb373849c5ab23c7 vp90-2-00-quantizer-48.webm +65ab6c9d1b682c183b201c7ff42b90343ce3e304 vp90-2-00-quantizer-48.webm.md5 +3bf04a598325ed0eabae1598ec7f718f715ec672 vp90-2-00-quantizer-49.webm +ac68c4387ce11fcc998d8ba455ab9b2bb361d240 vp90-2-00-quantizer-49.webm.md5 +d59238fb3a654931c9b65a11e7321b40d1f702e9 vp90-2-00-quantizer-50.webm +d0576bfede46fd55659f028f2fd28554ceb3e6cc vp90-2-00-quantizer-50.webm.md5 +3f579785101d4209360dd96f8c2ffe9beddf3bee vp90-2-00-quantizer-51.webm +89fcfe04f4457a7f02ab4a2f94aacbb88aee5789 vp90-2-00-quantizer-51.webm.md5 +28be5836e2fedefe4babf12fc9b79e460ab0a0f4 vp90-2-00-quantizer-52.webm +f3dd52b70c18345fee740220f35da9c4def2017a vp90-2-00-quantizer-52.webm.md5 +488ad4058c17170665b6acd1021fade9a02771e4 vp90-2-00-quantizer-53.webm +1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7 vp90-2-00-quantizer-53.webm.md5 +682978289cb28cc8c9d39bc797300e45d6039de7 vp90-2-00-quantizer-54.webm +36c35353f2c03cb099bd710d9994de7d9ed88834 vp90-2-00-quantizer-54.webm.md5 +c398ce49af762a48f10cc4da9fae0769aae5f226 
vp90-2-00-quantizer-55.webm +2cf3570542d984f167ab087f59493c7fb47e0ed2 vp90-2-00-quantizer-55.webm.md5 +3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3 vp90-2-00-quantizer-56.webm +d3f93f8272b6de31cffb011a26f11abb514efb12 vp90-2-00-quantizer-56.webm.md5 +f4e8e14b1f278801a7eb6f11734780a01b1668e9 vp90-2-00-quantizer-57.webm +6478fdf1d7faf6db5f19dffc5e1363af358699ee vp90-2-00-quantizer-57.webm.md5 +307dc264f57cc618fff211fa44d7f52767ed9660 vp90-2-00-quantizer-58.webm +cf231d4a52d492fa692ea4194ec5eb7511fec54e vp90-2-00-quantizer-58.webm.md5 +1fd7cd596170afce2de0b1441b7674bda5723440 vp90-2-00-quantizer-59.webm +4681f7ef96f63e085c41bb1a964b0df7e67e0b38 vp90-2-00-quantizer-59.webm.md5 +34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62 vp90-2-00-quantizer-60.webm +58691ef53b6b623810e2c57ded374c77535df935 vp90-2-00-quantizer-60.webm.md5 +e6e812406aab81021bb16e772c1db03f75906cb6 vp90-2-00-quantizer-61.webm +76436eace62f08ff92b61a0845e66667a027db1b vp90-2-00-quantizer-61.webm.md5 +84d811bceed70c950a6a08e572a6e274866e72b1 vp90-2-00-quantizer-62.webm +2d937cc011eeddd95222b960982da5cd18db580f vp90-2-00-quantizer-62.webm.md5 +0912b295ba0ea09359315315ffd67d22d046f883 vp90-2-00-quantizer-63.webm +5a829031055d70565f57dbcd47a6ac33619952b3 vp90-2-00-quantizer-63.webm.md5 +0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00 vp90-2-01-sharpness-1.webm +5a0476be4448bae8f8ca17ea236c98793a755948 vp90-2-01-sharpness-1.webm.md5 +51e02d7911810cdf5be8b68ac40aedab479a3179 vp90-2-01-sharpness-2.webm +a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6 vp90-2-01-sharpness-2.webm.md5 +0603f8ad239c07a531d948187f4dafcaf51eda8d vp90-2-01-sharpness-3.webm +3af8000a69c72fe77881e3176f026c2affb78cc7 vp90-2-01-sharpness-3.webm.md5 +4ca4839f48146252fb261ed88838d80211804841 vp90-2-01-sharpness-4.webm +08832a1494f84fa9edd40e080bcf2c0e80100c76 vp90-2-01-sharpness-4.webm.md5 +95099dc8f9cbaf9b9a7dd65311923e441ff70731 vp90-2-01-sharpness-5.webm +93ceee30c140f0b406726c0d896b9db6031c4c7f vp90-2-01-sharpness-5.webm.md5 +ceb4116fb7b078d266d153233b6d62a255a34e4c vp90-2-01-sharpness-6.webm +da83efe59e537ce538e8b03a6eac63cf25849c9a vp90-2-01-sharpness-6.webm.md5 +b5f7cd19aece3880f9d616a778e5cc24c6b9b505 vp90-2-01-sharpness-7.webm +2957408d20deac8633941a2169f801bae6f086e1 vp90-2-01-sharpness-7.webm.md5 +ffc096c2ce1050450ad462b5fabd2a5220846319 vp90-2-02-size-08x08.webm +e36d2ed6fa2746347710b750586aafa6a01ff3ae vp90-2-02-size-08x08.webm.md5 +895b986f9fd55cd879472b31c6a06b82094418c8 vp90-2-02-size-08x10.webm +079157a19137ccaebba606f2871f45a397347150 vp90-2-02-size-08x10.webm.md5 +1c5992203e62a2b83040ccbecd748b604e19f4c0 vp90-2-02-size-08x16.webm +9aa45ffdf2078f883bbed01450031b691819c144 vp90-2-02-size-08x16.webm.md5 +d0a8953da1f85f484487408fee5da9e2a8391901 vp90-2-02-size-08x18.webm +59a5cc17d354c6a23e5e959d666b1456a5d49c56 vp90-2-02-size-08x18.webm.md5 +1b13461a9fc65cb041bacfe4ea6f02d363397d61 vp90-2-02-size-08x32.webm +2bdddd6878f05d37d84cde056a3f5e7f926ba3d6 vp90-2-02-size-08x32.webm.md5 +2861f0a0daadb62295b0504a1fbe5b50c79a8f59 vp90-2-02-size-08x34.webm +6b5812cfb8a82d378ea2913bf009e93668020147 vp90-2-02-size-08x34.webm.md5 +02f948216d4246579dc53c47fe55d8fb264ba251 vp90-2-02-size-08x64.webm +84b55fdee6d9aa820c7a8c62822446184b191767 vp90-2-02-size-08x64.webm.md5 +4b011242cbf42516efd2b197baebb61dd34562c9 vp90-2-02-size-08x66.webm +6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b vp90-2-02-size-08x66.webm.md5 +4057796be9dd12df48ab607f502ae6aa70eeeab6 vp90-2-02-size-10x08.webm +71c752c51aec9f48de286b93f4c20e9c11cad7d0 vp90-2-02-size-10x08.webm.md5 +6583c853fa43fc53d51743eac5f3a43a359d45d0 
vp90-2-02-size-10x10.webm +1da524d24af1944b671d4d3f2b398d6e336584c3 vp90-2-02-size-10x10.webm.md5 +ba442fc03ccd3a705c64c83b36f5ada67d198874 vp90-2-02-size-10x16.webm +7cfd960f232c34c641a4a2a9411b6fd0efb2fc50 vp90-2-02-size-10x16.webm.md5 +cc92ed40eef14f52e4d080cb2c57939dd8326374 vp90-2-02-size-10x18.webm +db5626275cc55ce970b91c995e74f6838d943aca vp90-2-02-size-10x18.webm.md5 +3a93d501d22325e9fd4c9d8b82e2a432de33c351 vp90-2-02-size-10x32.webm +5cae51b0c71cfc131651f345f87583eb2903afaf vp90-2-02-size-10x32.webm.md5 +50d2f2b15a9a5178153db44a9e03aaf32b227f67 vp90-2-02-size-10x34.webm +bb0efe058122641e7f73e94497dda2b9e6c21efd vp90-2-02-size-10x34.webm.md5 +01624ec173e533e0b33fd9bdb91eb7360c7c9175 vp90-2-02-size-10x64.webm +b9c0e3b054463546356acf5157f9be92fd34732f vp90-2-02-size-10x64.webm.md5 +2942879baf1c09e96b14d0fc84806abfe129c706 vp90-2-02-size-10x66.webm +bab5f539c2f91952e187456b4beafbb4c01e25ee vp90-2-02-size-10x66.webm.md5 +88d2b63ca5e9ee163d8f20e8886f3df3ff301a66 vp90-2-02-size-16x08.webm +7f48a0fcf8c25963f3057d7f6669c5f2415834b8 vp90-2-02-size-16x08.webm.md5 +59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f vp90-2-02-size-16x10.webm +73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81 vp90-2-02-size-16x10.webm.md5 +066834fef9cf5b9a72932cf4dea5f253e14a976d vp90-2-02-size-16x16.webm +faec542f52f37601cb9c480d887ae9355be99372 vp90-2-02-size-16x16.webm.md5 +195307b4eb3192271ee4a935b0e48deef0c54cc2 vp90-2-02-size-16x18.webm +5a92e19e624c0376321d4d0e22c0c91995bc23e1 vp90-2-02-size-16x18.webm.md5 +14f3f884216d7ae16ec521f024a2f2d31bbf9c1a vp90-2-02-size-16x32.webm +ea622d1c817dd174556f7ee7ccfe4942b34d4845 vp90-2-02-size-16x32.webm.md5 +2e0501100578a5da9dd47e4beea160f945bdd1ba vp90-2-02-size-16x34.webm +1b8645ef64239334921c5f56b24ce815e6070b05 vp90-2-02-size-16x34.webm.md5 +89a6797fbebebe93215f367229a9152277f5dcfe vp90-2-02-size-16x64.webm +a03d8c1179ca626a8856fb416d635dbf377979cd vp90-2-02-size-16x64.webm.md5 +0f3a182e0750fcbae0b9eae80c7a53aabafdd18d vp90-2-02-size-16x66.webm +8cb6736dc2d897c1283919a32068af377d66c59c vp90-2-02-size-16x66.webm.md5 +68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1 vp90-2-02-size-18x08.webm +874c7fb505be9db3160c57cb405c4dbd5b990dc2 vp90-2-02-size-18x08.webm.md5 +0546352dd78496d4dd86c3727ac2ff36c9e72032 vp90-2-02-size-18x10.webm +1d80eb36557ea5f25a386495a36f93da0f25316b vp90-2-02-size-18x10.webm.md5 +60fe99e5f5cc99706efa3e0b894e45cbcf0d6330 vp90-2-02-size-18x16.webm +1ab6cdd89a53662995d103546e6611c84f9292ab vp90-2-02-size-18x16.webm.md5 +f9a8f5fb749d69fd555db6ca093b7f77800c7b4f vp90-2-02-size-18x18.webm +ace8a66328f7802b15f9989c2720c029c6abd279 vp90-2-02-size-18x18.webm.md5 +a197123a527ec25913a9bf52dc8c347749e00045 vp90-2-02-size-18x32.webm +34fbd7036752232d1663e70d7f7cdc93f7129202 vp90-2-02-size-18x32.webm.md5 +f219655a639a774a2c9c0a9f45c28dc0b5e75e24 vp90-2-02-size-18x34.webm +2c4d622a9ea548791c1a07903d3702e9774388bb vp90-2-02-size-18x34.webm.md5 +5308578da48c677d477a5404e19391d1303033c9 vp90-2-02-size-18x64.webm +e7fd4462527bac38559518ba80e41847db880f15 vp90-2-02-size-18x64.webm.md5 +e109a7e013bd179f97e378542e1e81689ed06802 vp90-2-02-size-18x66.webm +45c04e422fb383c1f3be04beefaa4490e83bdb1a vp90-2-02-size-18x66.webm.md5 +38844cae5d99caf445f7de33c3ae78494ce36c01 vp90-2-02-size-32x08.webm +ad018be39e493ca2405225034b1a5b7a42af6f3a vp90-2-02-size-32x08.webm.md5 +7b57eaad55906f9de9903c8657a3fcb2aaf792ea vp90-2-02-size-32x10.webm +2294425d4e55d275af5e25a0beac9738a1b4ee73 vp90-2-02-size-32x10.webm.md5 +f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1 vp90-2-02-size-32x16.webm 
+ae10981d93913f0ab1f28c1146255e01769aa8c0 vp90-2-02-size-32x16.webm.md5 +08b23ad838b6cf1fbfe3ad7e7775d95573e815fc vp90-2-02-size-32x18.webm +1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8 vp90-2-02-size-32x18.webm.md5 +d5b88ae6c8c25c53dee74d9f1e6ca64244349a57 vp90-2-02-size-32x32.webm +e39c067a8ee2da52a51641eb1cb7f8eba935eb6b vp90-2-02-size-32x32.webm.md5 +529429920dc36bd899059fa75a767f02c8c60874 vp90-2-02-size-32x34.webm +56888e7834f52b106e8911e3a7fc0f473b609995 vp90-2-02-size-32x34.webm.md5 +38e848e160391c2b1a55040aadde613b9f4bf15e vp90-2-02-size-32x64.webm +8950485fb3f68b0e8be234db860e4ec5f5490fd0 vp90-2-02-size-32x64.webm.md5 +5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94 vp90-2-02-size-32x66.webm +225df9d7d72ec711b0b60f4aeb65311c97db054a vp90-2-02-size-32x66.webm.md5 +695f929e2ce6fb11a1f180322d46c5cb1c97fa61 vp90-2-02-size-34x08.webm +5bb4262030018dd01883965c6aa6070185924ef6 vp90-2-02-size-34x08.webm.md5 +5adf74ec906d2ad3f7526e06bd29f5ad7d966a90 vp90-2-02-size-34x10.webm +71c100b437d3e8701632ae8d65c3555339b1c68f vp90-2-02-size-34x10.webm.md5 +d0918923c987fba2d00193d83797b21289fe54aa vp90-2-02-size-34x16.webm +5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d vp90-2-02-size-34x16.webm.md5 +553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd vp90-2-02-size-34x18.webm +a164c7f3c424987df2340496e6a8cf76e973f0f1 vp90-2-02-size-34x18.webm.md5 +baf3e233634f150de81c18ba5d8848068e1c3c54 vp90-2-02-size-34x32.webm +22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03 vp90-2-02-size-34x32.webm.md5 +6d50a533774a7167350e4a7ef43c94a5622179a2 vp90-2-02-size-34x34.webm +0c099638e79c273546523e06704553e42eb00b00 vp90-2-02-size-34x34.webm.md5 +698cdd0a5e895cc202c488675e682a8c537ede4f vp90-2-02-size-34x64.webm +9317b63987cddab8389510a27b86f9f3d46e3fa5 vp90-2-02-size-34x64.webm.md5 +4b5335ca06f082b6b69f584eb8e7886bdcafefd3 vp90-2-02-size-34x66.webm +e18d68b35428f46a84a947c646804a51ef1d7cec vp90-2-02-size-34x66.webm.md5 +a54ae7b494906ec928a876e8290e5574f2f9f6a2 vp90-2-02-size-64x08.webm +87f9f7087b6489d45e9e4b38ede2c5aef4a4928f vp90-2-02-size-64x08.webm.md5 +24522c70804a3c23d937df2d829ae63965b23f38 vp90-2-02-size-64x10.webm +447ce03938ab53bffcb4a841ee0bfaa90462dcb9 vp90-2-02-size-64x10.webm.md5 +2a5035d035d214ae614af8051930690ef623989b vp90-2-02-size-64x16.webm +84e355761dd2e0361b904c84c52a0dd0384d89cf vp90-2-02-size-64x16.webm.md5 +3a293ef4e270a19438e59b817fbe5f43eed4d36b vp90-2-02-size-64x18.webm +666824e5ba746779eb46079e0631853dcc86d48b vp90-2-02-size-64x18.webm.md5 +ed32fae837095c9e8fc95d223ec68101812932c2 vp90-2-02-size-64x32.webm +97086eadedce1d0d9c072b585ba7b49aec69b1e7 vp90-2-02-size-64x32.webm.md5 +696c7a7250bdfff594f4dfd88af34239092ecd00 vp90-2-02-size-64x34.webm +253a1d38d452e7826b086846c6f872f829c276bb vp90-2-02-size-64x34.webm.md5 +fc508e0e3c2e6872c60919a60b812c5232e9c2b0 vp90-2-02-size-64x64.webm +2cd6ebeca0f82e9f505616825c07950371b905ab vp90-2-02-size-64x64.webm.md5 +0f8a4fc1d6521187660425c283f08dff8c66e476 vp90-2-02-size-64x66.webm +5806be11a1d346be235f88d3683e69f73746166c vp90-2-02-size-64x66.webm.md5 +273b0c36e3658685cde250408a478116d7ae92f1 vp90-2-02-size-66x08.webm +23c3cd0dca20a2f71f036e77ea92025ff4e7a298 vp90-2-02-size-66x08.webm.md5 +4844c59c3306d1e671bb0568f00e344bf797e66e vp90-2-02-size-66x10.webm +e041eaf6841d775f8fde8bbb4949d2733fdaab7f vp90-2-02-size-66x10.webm.md5 +bdf3f1582b234fcd2805ffec59f9d716a2345302 vp90-2-02-size-66x16.webm +2ec85ee18119e6798968571ea6e1b93ca386e3af vp90-2-02-size-66x16.webm.md5 +0acce9af12b13b025d5274013da7ef6f568f075f vp90-2-02-size-66x18.webm +77c4d53e2a5c96b70af9d575fe6811e0f5ee627b 
vp90-2-02-size-66x18.webm.md5 +682b36a25774bbdedcd603f504d18eb63f0167d4 vp90-2-02-size-66x32.webm +53728fae2a428f16d376a29f341a64ddca97996a vp90-2-02-size-66x32.webm.md5 +e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6 vp90-2-02-size-66x34.webm +f69a6a555e3f614b0a35f9bfc313d8ebb35bc725 vp90-2-02-size-66x34.webm.md5 +4151b8c29452d5c2266397a7b9bf688899a2937b vp90-2-02-size-66x64.webm +69486e7fd9e380b6c97a03d3e167affc79f73840 vp90-2-02-size-66x64.webm.md5 +68784a1ecac776fe2a3f230345af32f06f123536 vp90-2-02-size-66x66.webm +7f008c7f48d55e652fbd6bac405b51e0015c94f2 vp90-2-02-size-66x66.webm.md5 +7e1bc449231ac1c5c2a11c9a6333b3e828763798 vp90-2-03-size-196x196.webm +6788a561466dace32d500194bf042e19cccc35e1 vp90-2-03-size-196x196.webm.md5 +a170c9a88ec1dd854c7a471ff55fb2a97ac31870 vp90-2-03-size-196x198.webm +6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2 vp90-2-03-size-196x198.webm.md5 +68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f vp90-2-03-size-196x200.webm +bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e vp90-2-03-size-196x200.webm.md5 +fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1 vp90-2-03-size-196x202.webm +158ee72af578f39aad0c3b8f4cbed2fc78b57e0f vp90-2-03-size-196x202.webm.md5 +dd28fb7247af534bdf5e6795a3ac429610489a0b vp90-2-03-size-196x208.webm +7546be847efce2d1c0a23f807bfb03f91b764e1e vp90-2-03-size-196x208.webm.md5 +41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8 vp90-2-03-size-196x210.webm +9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1 vp90-2-03-size-196x210.webm.md5 +5007bc618143437c009d6dde5fc2e86f72d37dc2 vp90-2-03-size-196x224.webm +858361d8f79b44df5545feabbc9754ec9ede632f vp90-2-03-size-196x224.webm.md5 +0bcbe357fbc776c3fa68e7117179574ed7564a44 vp90-2-03-size-196x226.webm +72006a5f42031a43d70a2cd9fc1958962a86628f vp90-2-03-size-196x226.webm.md5 +000239f048cceaac055558e97ef07078ebf65502 vp90-2-03-size-198x196.webm +2d6841901b72000c5340f30be602853438c1b787 vp90-2-03-size-198x196.webm.md5 +ae75b766306a6404c3b3b35a6b6d53633c14fbdb vp90-2-03-size-198x198.webm +3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1 vp90-2-03-size-198x198.webm.md5 +95ffd573fa84ccef1cd59e1583e6054f56a5c83d vp90-2-03-size-198x200.webm +5d537e3c9b9c54418c79677543454c4cda3de1af vp90-2-03-size-198x200.webm.md5 +ecc845bf574375f469bc91bf5c75c79dc00073d6 vp90-2-03-size-198x202.webm +1b59f5e111265615a7a459eeda8cc9045178d228 vp90-2-03-size-198x202.webm.md5 +432fb27144fe421b9f51cf44d2750a26133ed585 vp90-2-03-size-198x208.webm +a58a67f4fb357c73ca078aeecbc0f782975630b1 vp90-2-03-size-198x208.webm.md5 +ff5058e7e6a47435046612afc8536f2040989e6f vp90-2-03-size-198x210.webm +18d3be7935e52217e2e9400b6f2c681a9e45dc89 vp90-2-03-size-198x210.webm.md5 +a0d55263c1ed2c03817454dd4ec4090d36dbc864 vp90-2-03-size-198x224.webm +efa366a299817e2da51c00623b165aab9fbb8d91 vp90-2-03-size-198x224.webm.md5 +ccd142fa2920fc85bb753f049160c1c353ad1574 vp90-2-03-size-198x226.webm +534524a0b2dbff852e0b92ef09939db072f83243 vp90-2-03-size-198x226.webm.md5 +0d483b94ed40abc8ab6e49f960432ee54ad9c7f1 vp90-2-03-size-200x196.webm +41795f548181717906e7a504ba551f06c32102ae vp90-2-03-size-200x196.webm.md5 +f6c2dc54e0989d50f01333fe40c91661fcbf849a vp90-2-03-size-200x198.webm +43df5d8c46a40089441392e6d096c588c1079a68 vp90-2-03-size-200x198.webm.md5 +2f6e9df82e44fc145f0d9212dcccbed3de605e23 vp90-2-03-size-200x200.webm +757b2ef96b82093255725bab9690bbafe27f3caf vp90-2-03-size-200x200.webm.md5 +40c5ea60415642a4a2e75c0d127b06309baadfab vp90-2-03-size-200x202.webm +3022c4a1c625b5dc04fdb1052d17d45b4171cfba vp90-2-03-size-200x202.webm.md5 +6942ed5b27476bb8506d10e600d6ff60887780ca vp90-2-03-size-200x208.webm 
+c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be vp90-2-03-size-200x208.webm.md5 +71dbc99b83c49d1da45589b91eabb98e2f4a7b1e vp90-2-03-size-200x210.webm +3f0b40da7eef7974b9bc326562f251feb67d9c7c vp90-2-03-size-200x210.webm.md5 +6b6b8489081cfefb377cc5f18eb754ec2383f655 vp90-2-03-size-200x224.webm +a259df2ac0e294492e3f9d4315baa34cab044f04 vp90-2-03-size-200x224.webm.md5 +c9adc1c9bb07559349a0b054df4af56f7a6edbb9 vp90-2-03-size-200x226.webm +714cec61e3575581e4f1a0e3921f4dfdbbd316c5 vp90-2-03-size-200x226.webm.md5 +f9bdc936bdf53f8be9ce78fecd41a21d31ff3943 vp90-2-03-size-202x196.webm +5b8e2e50fcea2c43b12fc067b8a9cc117af77bda vp90-2-03-size-202x196.webm.md5 +c7b66ea3da87613deb47ff24a111247d3c384fec vp90-2-03-size-202x198.webm +517e91204b25586da943556f4adc5951c9be8bee vp90-2-03-size-202x198.webm.md5 +935ef56b01cfdb4265a7e24696645209ccb20970 vp90-2-03-size-202x200.webm +55b8ec4a2513183144a8e27564596c06c7576fce vp90-2-03-size-202x200.webm.md5 +849acf75e4f1d8d90046704e1103a18c64f30e35 vp90-2-03-size-202x202.webm +c79afc6660df2824e7df314e5bfd71f0d8acf76b vp90-2-03-size-202x202.webm.md5 +17b3a4d55576b770626ccb856b9f1a6c8f6ae476 vp90-2-03-size-202x208.webm +0b887ff30409c58f2ccdc3bfacd6be7c69f8997a vp90-2-03-size-202x208.webm.md5 +032d0ade4230fb2eef6d19915a7a1c9aa4a52617 vp90-2-03-size-202x210.webm +f78f8e79533c0c88dd2bfdcec9b1c07848568ece vp90-2-03-size-202x210.webm.md5 +915a38c31fe425d5b93c837121cfa8082f5ea5bc vp90-2-03-size-202x224.webm +bf52a104074d0c5942aa7a5b31e11db47e43d48e vp90-2-03-size-202x224.webm.md5 +be5cfde35666fa435e47d544d9258215beb1cf29 vp90-2-03-size-202x226.webm +2fa2f87502fda756b319389c8975204e130a2e3f vp90-2-03-size-202x226.webm.md5 +15d908e97862b5b4bf295610df011fb9aa09909b vp90-2-03-size-208x196.webm +50c60792305d6a99be376dd596a6ff979325e6cc vp90-2-03-size-208x196.webm.md5 +a367c7bc9fde56d6f4848cc573c7d4c1ce75e348 vp90-2-03-size-208x198.webm +be85fb2c8d435a75484231356f07d06ebddd13cd vp90-2-03-size-208x198.webm.md5 +05fd46deb7288e7253742091f56e54a9a441a187 vp90-2-03-size-208x200.webm +74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c vp90-2-03-size-208x200.webm.md5 +d8985c4b386513a7385a4b3639bf91e469f1378b vp90-2-03-size-208x202.webm +0614a1e8d92048852adcf605a51333f5fabc7f03 vp90-2-03-size-208x202.webm.md5 +28b002242238479165ba4fb87ee6b442c64b32e4 vp90-2-03-size-208x208.webm +37de5aca59bb900228400b0e115d3229edb9dcc0 vp90-2-03-size-208x208.webm.md5 +c545be0050c2fad7c68427dbf86c62a739e94ab3 vp90-2-03-size-208x210.webm +d646eccb3cd578f94b54777e32b88898bef6e17a vp90-2-03-size-208x210.webm.md5 +63a0cfe295b661026dd7b1bebb67acace1db766f vp90-2-03-size-208x224.webm +85c0361d93bf85a335248fef2767ff43eeef23db vp90-2-03-size-208x224.webm.md5 +f911cc718d66e4fe8a865226088939c9eb1b7825 vp90-2-03-size-208x226.webm +a6d583a57876e7b7ec48625b2b2cdbcf70cab837 vp90-2-03-size-208x226.webm.md5 +5bbb0f36da9a4683cf04e724124d8696332911bf vp90-2-03-size-210x196.webm +a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d vp90-2-03-size-210x196.webm.md5 +8db64d6f9ce36dd382013b42ae4e292deba697bc vp90-2-03-size-210x198.webm +eda20f8268c7f4147bead4059e9c4897e09140a9 vp90-2-03-size-210x198.webm.md5 +ce391505eeaf1d12406563101cd6b2dbbbb44bfc vp90-2-03-size-210x200.webm +79d73b7f623082d2a00aa33e95c79d11c7d9c3a8 vp90-2-03-size-210x200.webm.md5 +852db6fdc206e72391fc69b807f1954934679949 vp90-2-03-size-210x202.webm +f69414c5677ed2f2b8b37ae76429e509a92276a5 vp90-2-03-size-210x202.webm.md5 +c424cc3edd2308da7d33f27acb36b54db5bf2595 vp90-2-03-size-210x208.webm +27b18562faa1b3184256f4eae8114b539b3e9d3e vp90-2-03-size-210x208.webm.md5 
+dd029eba719d50a2851592fa8b9b2efe88904930 vp90-2-03-size-210x210.webm +c853a1670465eaa04ca31b3511995f1b6ed4f58f vp90-2-03-size-210x210.webm.md5 +d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3 vp90-2-03-size-210x224.webm +93b793e79d987065b39ad8e2e71244368435fc25 vp90-2-03-size-210x224.webm.md5 +3d0825fe83bcc125be1f78145ff43ca6d7588784 vp90-2-03-size-210x226.webm +5230f31a57ca3b5311698a12035d2644533b3ec4 vp90-2-03-size-210x226.webm.md5 +6622f8bd9279e1ce45509a58a31a990052d45e14 vp90-2-03-size-224x196.webm +65411da07f60113f2be05c807879072b161d561e vp90-2-03-size-224x196.webm.md5 +6744ff2ee2c41eb08c62ff30880833b6d77b585b vp90-2-03-size-224x198.webm +46ea3641d41acd4bff347b224646c060d5620385 vp90-2-03-size-224x198.webm.md5 +8eb91f3416a1404705f370caecd74b2b458351b1 vp90-2-03-size-224x200.webm +196aefb854c8b95b9330263d6690b7ee15693ecf vp90-2-03-size-224x200.webm.md5 +256a5a23ef4e6d5ef2871af5afb8cd13d28cec00 vp90-2-03-size-224x202.webm +840ad8455dcf2be378c14b007e66fa642fc8196d vp90-2-03-size-224x202.webm.md5 +db4606480ab48b96c9a6ff5e639f1f1aea2a12e4 vp90-2-03-size-224x208.webm +40b9801d5620467499ac70fa6b7c40aaa5e1c331 vp90-2-03-size-224x208.webm.md5 +e37159e687fe1cb24cffddfae059301adbaf4212 vp90-2-03-size-224x210.webm +1e4acd4b6334ae260c3eed08652d0ba8122073f2 vp90-2-03-size-224x210.webm.md5 +0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130 vp90-2-03-size-224x224.webm +37db449ad86fb286c2c02d94aa8fe0379c05044a vp90-2-03-size-224x224.webm.md5 +32ebbf903a7d7881bcfe59639f1d472371f3bf27 vp90-2-03-size-224x226.webm +5cc3ac5dc9f6912491aa2ddac863f8187f34c569 vp90-2-03-size-224x226.webm.md5 +9480ff5c2c32b1870ac760c87514912616e6cf01 vp90-2-03-size-226x196.webm +fe83655c0f1888f0af7b047785f01ba7ca9f1324 vp90-2-03-size-226x196.webm.md5 +09cad4221996315cdddad4e502dbfabf53ca1d6a vp90-2-03-size-226x198.webm +e3ddfdc650acb95adb45abd9b634e1f09ea8ac96 vp90-2-03-size-226x198.webm.md5 +c34f49d55fe39e3f0b607e3cc95e30244225cecb vp90-2-03-size-226x200.webm +abb83edc868a3523ccd4e5523fac2efbe7c3df1f vp90-2-03-size-226x200.webm.md5 +d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm +1d22d2d0f375251c2d5a1acb4714bc35d963865b vp90-2-03-size-226x202.webm.md5 +9bd537c4f92a25596ccd29fedfe181feac948b92 vp90-2-03-size-226x208.webm +6feb0e7325386275719f3511ada9e248a2ae7df4 vp90-2-03-size-226x208.webm.md5 +4487067f6cedd495b93696b44b37fe0a3e7eda14 vp90-2-03-size-226x210.webm +49a8fa87945f47208168d541c068e78d878075d5 vp90-2-03-size-226x210.webm.md5 +559fea2f8da42b33c1aa1dbc34d1d6781009847a vp90-2-03-size-226x224.webm +83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5 +fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm +94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5 diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index 806901d..619533a 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -25,6 +25,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c LIBVPX_TEST_SRCS-yes += decode_test_driver.cc @@ -66,6 +68,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc 
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc @@ -227,223 +230,401 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-25.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-25.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-200.webm 
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-3600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-3600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-5200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-5200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-00-hallmonitor-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-3600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-3600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-5200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-5200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-300.webm 
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-25.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-25.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-00-students-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-3600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-3600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-5200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-5200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-800.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-00-quantizer-35.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-02-size-10x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-02-size-32x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-02-size-66x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-03-size-224x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5 diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc index d7bd184..9b0e9d5 100644 --- a/libvpx/test/test_vector_test.cc +++ b/libvpx/test/test_vector_test.cc @@ -60,61 +60,106 @@ const char *kVP8TestVectors[] = { #endif #if CONFIG_VP9_DECODER const char *kVP9TestVectors[] = { - "vp90-00-akiyo-200.webm", "vp90-00-akiyo-300.webm", - "vp90-00-akiyo-50.webm", "vp90-00-bowing-150.webm", - "vp90-00-bowing-25.webm", "vp90-00-bowing-400.webm", - "vp90-00-bus-100.webm", "vp90-00-bus-2000.webm", - "vp90-00-bus-300.webm", "vp90-00-bus-4400.webm", - "vp90-00-bus-800.webm", "vp90-00-cheer-1600.webm", - "vp90-00-cheer-2800.webm", "vp90-00-cheer-400.webm", - "vp90-00-cheer-600.webm", "vp90-00-city-1200.webm", - "vp90-00-city-2000.webm", "vp90-00-city-300.webm", - "vp90-00-city-600.webm", "vp90-00-coastguard-1200.webm", - "vp90-00-coastguard-200.webm", "vp90-00-coastguard-3600.webm", - "vp90-00-coastguard-5200.webm", "vp90-00-container-1000.webm", - "vp90-00-container-200.webm", "vp90-00-container-50.webm", - "vp90-00-deadline-1000.webm", "vp90-00-deadline-200.webm", - "vp90-00-deadline-50.webm", "vp90-00-flower-100.webm", - "vp90-00-flower-2000.webm", "vp90-00-flower-300.webm", - "vp90-00-flower-4400.webm", 
"vp90-00-flower-800.webm", - "vp90-00-football-1600.webm", "vp90-00-football-2800.webm", - "vp90-00-football-400.webm", "vp90-00-football-600.webm", - "vp90-00-foreman-1200.webm", "vp90-00-foreman-2000.webm", - "vp90-00-foreman-300.webm", "vp90-00-foreman-600.webm", - "vp90-00-hallmonitor-1200.webm", "vp90-00-hallmonitor-2000.webm", - "vp90-00-hallmonitor-300.webm", "vp90-00-hallmonitor-600.webm", - "vp90-00-harbour-1200.webm", "vp90-00-harbour-200.webm", - "vp90-00-harbour-3600.webm", "vp90-00-harbour-5200.webm", - "vp90-00-highway-100.webm", "vp90-00-highway-1600.webm", - "vp90-00-highway-2800.webm", "vp90-00-highway-50.webm", - "vp90-00-husky-100.webm", "vp90-00-husky-2000.webm", - "vp90-00-husky-300.webm", "vp90-00-husky-4400.webm", - "vp90-00-husky-800.webm", "vp90-00-ice-150.webm", - "vp90-00-ice-400.webm", "vp90-00-ice-800.webm", - "vp90-00-mobile-1600.webm", "vp90-00-mobile-2800.webm", - "vp90-00-mobile-400.webm", "vp90-00-mobile-600.webm", - "vp90-00-motherdaughter-100.webm", "vp90-00-motherdaughter-300.webm", - "vp90-00-motherdaughter-600.webm", "vp90-00-news-100.webm", - "vp90-00-news-300.webm", "vp90-00-news-600.webm", - "vp90-00-pamphlet-150.webm", "vp90-00-pamphlet-25.webm", - "vp90-00-pamphlet-400.webm", "vp90-00-paris-1000.webm", - "vp90-00-paris-200.webm", "vp90-00-paris-50.webm", - "vp90-00-signirene-1000.webm", "vp90-00-signirene-200.webm", - "vp90-00-signirene-50.webm", "vp90-00-silent-1000.webm", - "vp90-00-silent-200.webm", "vp90-00-silent-50.webm", - "vp90-00-soccer-100.webm", "vp90-00-soccer-2000.webm", - "vp90-00-soccer-300.webm", "vp90-00-soccer-4400.webm", - "vp90-00-soccer-800.webm", "vp90-00-stefan-1600.webm", - "vp90-00-stefan-2800.webm", "vp90-00-stefan-400.webm", - "vp90-00-stefan-600.webm", "vp90-00-students-100.webm", - "vp90-00-students-300.webm", "vp90-00-students-600.webm", - "vp90-00-tempete-1200.webm", "vp90-00-tempete-200.webm", - "vp90-00-tempete-3600.webm", "vp90-00-tempete-5200.webm", - "vp90-00-tennis-100.webm", "vp90-00-tennis-2000.webm", - "vp90-00-tennis-300.webm", "vp90-00-tennis-4400.webm", - "vp90-00-tennis-800.webm", "vp90-00-waterfall-150.webm", - "vp90-00-waterfall-200.webm", "vp90-00-waterfall-400.webm", - "vp90-00-waterfall-800.webm", + "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm", + "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm", + "vp90-2-00-quantizer-04.webm", "vp90-2-00-quantizer-05.webm", + "vp90-2-00-quantizer-06.webm", "vp90-2-00-quantizer-07.webm", + "vp90-2-00-quantizer-08.webm", "vp90-2-00-quantizer-09.webm", + "vp90-2-00-quantizer-10.webm", "vp90-2-00-quantizer-11.webm", + "vp90-2-00-quantizer-12.webm", "vp90-2-00-quantizer-13.webm", + "vp90-2-00-quantizer-14.webm", "vp90-2-00-quantizer-15.webm", + "vp90-2-00-quantizer-16.webm", "vp90-2-00-quantizer-17.webm", + "vp90-2-00-quantizer-18.webm", "vp90-2-00-quantizer-19.webm", + "vp90-2-00-quantizer-20.webm", "vp90-2-00-quantizer-21.webm", + "vp90-2-00-quantizer-22.webm", "vp90-2-00-quantizer-23.webm", + "vp90-2-00-quantizer-24.webm", "vp90-2-00-quantizer-25.webm", + "vp90-2-00-quantizer-26.webm", "vp90-2-00-quantizer-27.webm", + "vp90-2-00-quantizer-28.webm", "vp90-2-00-quantizer-29.webm", + "vp90-2-00-quantizer-30.webm", "vp90-2-00-quantizer-31.webm", + "vp90-2-00-quantizer-32.webm", "vp90-2-00-quantizer-33.webm", + "vp90-2-00-quantizer-34.webm", "vp90-2-00-quantizer-35.webm", + "vp90-2-00-quantizer-36.webm", "vp90-2-00-quantizer-37.webm", + "vp90-2-00-quantizer-38.webm", "vp90-2-00-quantizer-39.webm", + "vp90-2-00-quantizer-40.webm", 
"vp90-2-00-quantizer-41.webm", + "vp90-2-00-quantizer-42.webm", "vp90-2-00-quantizer-43.webm", + "vp90-2-00-quantizer-44.webm", "vp90-2-00-quantizer-45.webm", + "vp90-2-00-quantizer-46.webm", "vp90-2-00-quantizer-47.webm", + "vp90-2-00-quantizer-48.webm", "vp90-2-00-quantizer-49.webm", + "vp90-2-00-quantizer-50.webm", "vp90-2-00-quantizer-51.webm", + "vp90-2-00-quantizer-52.webm", "vp90-2-00-quantizer-53.webm", + "vp90-2-00-quantizer-54.webm", "vp90-2-00-quantizer-55.webm", + "vp90-2-00-quantizer-56.webm", "vp90-2-00-quantizer-57.webm", + "vp90-2-00-quantizer-58.webm", "vp90-2-00-quantizer-59.webm", + "vp90-2-00-quantizer-60.webm", "vp90-2-00-quantizer-61.webm", + "vp90-2-00-quantizer-62.webm", "vp90-2-00-quantizer-63.webm", + "vp90-2-01-sharpness-1.webm", "vp90-2-01-sharpness-2.webm", + "vp90-2-01-sharpness-3.webm", "vp90-2-01-sharpness-4.webm", + "vp90-2-01-sharpness-5.webm", "vp90-2-01-sharpness-6.webm", + "vp90-2-01-sharpness-7.webm", "vp90-2-02-size-08x08.webm", + "vp90-2-02-size-08x10.webm", "vp90-2-02-size-08x16.webm", + "vp90-2-02-size-08x18.webm", "vp90-2-02-size-08x32.webm", + "vp90-2-02-size-08x34.webm", "vp90-2-02-size-08x64.webm", + "vp90-2-02-size-08x66.webm", "vp90-2-02-size-10x08.webm", + "vp90-2-02-size-10x10.webm", "vp90-2-02-size-10x16.webm", + "vp90-2-02-size-10x18.webm", "vp90-2-02-size-10x32.webm", + "vp90-2-02-size-10x34.webm", "vp90-2-02-size-10x64.webm", + "vp90-2-02-size-10x66.webm", "vp90-2-02-size-16x08.webm", + "vp90-2-02-size-16x10.webm", "vp90-2-02-size-16x16.webm", + "vp90-2-02-size-16x18.webm", "vp90-2-02-size-16x32.webm", + "vp90-2-02-size-16x34.webm", "vp90-2-02-size-16x64.webm", + "vp90-2-02-size-16x66.webm", "vp90-2-02-size-18x08.webm", + "vp90-2-02-size-18x10.webm", "vp90-2-02-size-18x16.webm", + "vp90-2-02-size-18x18.webm", "vp90-2-02-size-18x32.webm", + "vp90-2-02-size-18x34.webm", "vp90-2-02-size-18x64.webm", + "vp90-2-02-size-18x66.webm", "vp90-2-02-size-32x08.webm", + "vp90-2-02-size-32x10.webm", "vp90-2-02-size-32x16.webm", + "vp90-2-02-size-32x18.webm", "vp90-2-02-size-32x32.webm", + "vp90-2-02-size-32x34.webm", "vp90-2-02-size-32x64.webm", + "vp90-2-02-size-32x66.webm", "vp90-2-02-size-34x08.webm", + "vp90-2-02-size-34x10.webm", "vp90-2-02-size-34x16.webm", + "vp90-2-02-size-34x18.webm", "vp90-2-02-size-34x32.webm", + "vp90-2-02-size-34x34.webm", "vp90-2-02-size-34x64.webm", + "vp90-2-02-size-34x66.webm", "vp90-2-02-size-64x08.webm", + "vp90-2-02-size-64x10.webm", "vp90-2-02-size-64x16.webm", + "vp90-2-02-size-64x18.webm", "vp90-2-02-size-64x32.webm", + "vp90-2-02-size-64x34.webm", "vp90-2-02-size-64x64.webm", + "vp90-2-02-size-64x66.webm", "vp90-2-02-size-66x08.webm", + "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm", + "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm", + "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm", + "vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm", + "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm", + "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm", + "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm", + "vp90-2-03-size-196x226.webm", "vp90-2-03-size-198x196.webm", + "vp90-2-03-size-198x198.webm", "vp90-2-03-size-198x200.webm", + "vp90-2-03-size-198x202.webm", "vp90-2-03-size-198x208.webm", + "vp90-2-03-size-198x210.webm", "vp90-2-03-size-198x224.webm", + "vp90-2-03-size-198x226.webm", "vp90-2-03-size-200x196.webm", + "vp90-2-03-size-200x198.webm", "vp90-2-03-size-200x200.webm", + "vp90-2-03-size-200x202.webm", "vp90-2-03-size-200x208.webm", + 
"vp90-2-03-size-200x210.webm", "vp90-2-03-size-200x224.webm", + "vp90-2-03-size-200x226.webm", "vp90-2-03-size-202x196.webm", + "vp90-2-03-size-202x198.webm", "vp90-2-03-size-202x200.webm", + "vp90-2-03-size-202x202.webm", "vp90-2-03-size-202x208.webm", + "vp90-2-03-size-202x210.webm", "vp90-2-03-size-202x224.webm", + "vp90-2-03-size-202x226.webm", "vp90-2-03-size-208x196.webm", + "vp90-2-03-size-208x198.webm", "vp90-2-03-size-208x200.webm", + "vp90-2-03-size-208x202.webm", "vp90-2-03-size-208x208.webm", + "vp90-2-03-size-208x210.webm", "vp90-2-03-size-208x224.webm", + "vp90-2-03-size-208x226.webm", "vp90-2-03-size-210x196.webm", + "vp90-2-03-size-210x198.webm", "vp90-2-03-size-210x200.webm", + "vp90-2-03-size-210x202.webm", "vp90-2-03-size-210x208.webm", + "vp90-2-03-size-210x210.webm", "vp90-2-03-size-210x224.webm", + "vp90-2-03-size-210x226.webm", "vp90-2-03-size-224x196.webm", + "vp90-2-03-size-224x198.webm", "vp90-2-03-size-224x200.webm", + "vp90-2-03-size-224x202.webm", "vp90-2-03-size-224x208.webm", + "vp90-2-03-size-224x210.webm", "vp90-2-03-size-224x224.webm", + "vp90-2-03-size-224x226.webm", "vp90-2-03-size-226x196.webm", + "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm", + "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm", + "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm", + "vp90-2-03-size-226x226.webm" }; #endif @@ -136,6 +181,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, virtual void DecompressedFrameHook(const vpx_image_t& img, const unsigned int frame_number) { + ASSERT_TRUE(md5_file_ != NULL); char expected_md5[33]; char junk[128]; diff --git a/libvpx/test/tile_independence_test.cc b/libvpx/test/tile_independence_test.cc index 9633ed7..403dbb6 100644 --- a/libvpx/test/tile_independence_test.cc +++ b/libvpx/test/tile_independence_test.cc @@ -23,10 +23,13 @@ extern "C" { namespace { class TileIndependenceTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { + public ::libvpx_test::CodecTestWithParam<int> { protected: - TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)), - md5_fw_order_(), md5_inv_order_() { + TileIndependenceTest() + : EncoderTest(GET_PARAM(0)), + md5_fw_order_(), + md5_inv_order_(), + n_tiles_(GET_PARAM(1)) { init_flags_ = VPX_CODEC_USE_PSNR; vpx_codec_dec_cfg_t cfg; cfg.w = 704; @@ -56,9 +59,8 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt, ::libvpx_test::MD5 *md5) { - const vpx_codec_err_t res = - dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf), - pkt->data.frame.sz); + const vpx_codec_err_t res = dec->DecodeFrame( + reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz); if (res != VPX_CODEC_OK) { abort_ = true; ASSERT_EQ(VPX_CODEC_OK, res); @@ -72,11 +74,11 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, UpdateMD5(inv_dec_, pkt, &md5_inv_order_); } - private: - int n_tiles_; - protected: ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_; ::libvpx_test::Decoder *fw_dec_, *inv_dec_; + + private: + int n_tiles_; }; // run an encode with 2 or 4 tiles, and do the decode both in normal and @@ -93,7 +95,7 @@ TEST_P(TileIndependenceTest, MD5Match) { timebase.den, timebase.num, 0, 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - const char *md5_fw_str = md5_fw_order_.Get(); + const char *md5_fw_str = md5_fw_order_.Get(); const char *md5_inv_str = md5_inv_order_.Get(); // could use ASSERT_EQ(!memcmp(.., .., 
16) here, but this gives nicer @@ -102,7 +104,6 @@ TEST_P(TileIndependenceTest, MD5Match) { ASSERT_STREQ(md5_fw_str, md5_inv_str); } -VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, - ::testing::Range(0, 2, 1)); +VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1)); } // namespace diff --git a/libvpx/test/util.h b/libvpx/test/util.h index 533a1db..4d7f3d4 100644 --- a/libvpx/test/util.h +++ b/libvpx/test/util.h @@ -37,7 +37,7 @@ static double compute_psnr(const vpx_image_t *img1, img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j]; sqrerr += d * d; } - double mse = sqrerr / (width_y * height_y); + double mse = static_cast<double>(sqrerr) / (width_y * height_y); double psnr = 100.0; if (mse > 0.0) { psnr = 10 * log10(255.0 * 255.0 / mse); diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index dfa1a07..207b6e7 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -13,10 +13,12 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/clear_system_state.h" +#include "test/register_state_check.h" #include "vpx/vpx_integer.h" #include "vpx_config.h" extern "C" { +#include "vpx_mem/vpx_mem.h" #if CONFIG_VP8_ENCODER # include "vp8/common/variance.h" # include "vp8_rtcd.h" @@ -26,12 +28,83 @@ extern "C" { # include "vp9_rtcd.h" #endif } +#include "test/acm_random.h" namespace { using ::std::tr1::get; using ::std::tr1::make_tuple; using ::std::tr1::tuple; +using libvpx_test::ACMRandom; + +static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int diff = ref[w * y + x] - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + +static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + +static unsigned int subpel_avg_variance_ref(const uint8_t *ref, + const uint8_t *src, + const uint8_t *second_pred, + int l2w, int l2h, + int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w 
* y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} template<typename VarianceFunctionType> class VarianceTest : @@ -39,10 +112,13 @@ class VarianceTest : public: virtual void SetUp() { const tuple<int, int, VarianceFunctionType>& params = this->GetParam(); - width_ = get<0>(params); - height_ = get<1>(params); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; variance_ = get<2>(params); + rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = new uint8_t[block_size_]; ref_ = new uint8_t[block_size_]; @@ -58,15 +134,16 @@ class VarianceTest : protected: void ZeroTest(); + void RefTest(); void OneQuarterTest(); + ACMRandom rnd; uint8_t* src_; uint8_t* ref_; - int width_; - int height_; + int width_, log2width_; + int height_, log2height_; int block_size_; VarianceFunctionType variance_; - }; template<typename VarianceFunctionType> @@ -76,24 +153,133 @@ void VarianceTest<VarianceFunctionType>::ZeroTest() { for (int j = 0; j <= 255; ++j) { memset(ref_, j, block_size_); unsigned int sse; - const unsigned int var = variance_(src_, width_, ref_, width_, &sse); + unsigned int var; + REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j; } } } template<typename VarianceFunctionType> +void VarianceTest<VarianceFunctionType>::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + unsigned int var1; + REGISTER_STATE_CHECK(var1 = variance_(src_, width_, ref_, width_, &sse1)); + const unsigned int var2 = variance_ref(src_, ref_, log2width_, + log2height_, &sse2); + EXPECT_EQ(sse1, sse2); + EXPECT_EQ(var1, var2); + } +} + +template<typename VarianceFunctionType> void VarianceTest<VarianceFunctionType>::OneQuarterTest() { memset(src_, 255, block_size_); const int half = block_size_ / 2; memset(ref_, 255, half); memset(ref_ + half, 0, half); unsigned int sse; - const unsigned int var = variance_(src_, width_, ref_, width_, &sse); + unsigned int var; + REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); const unsigned int expected = block_size_ * 255 * 255 / 4; EXPECT_EQ(expected, var); } +template<typename SubpelVarianceFunctionType> +class SubpelVarianceTest : + public ::testing::TestWithParam<tuple<int, int, + SubpelVarianceFunctionType> > { + public: + virtual void SetUp() { + const tuple<int, int, SubpelVarianceFunctionType>& params = + this->GetParam(); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; + subpel_variance_ = get<2>(params); + + rnd(ACMRandom::DeterministicSeed()); + block_size_ = width_ * height_; + src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); + sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); + ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(sec_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + vpx_free(src_); + delete[] ref_; + vpx_free(sec_); + libvpx_test::ClearSystemState(); + } + + protected: + void RefTest(); + + ACMRandom rnd; + uint8_t *src_; + uint8_t *ref_; + uint8_t *sec_; + int width_, log2width_; + int height_, log2height_; + int block_size_; + SubpelVarianceFunctionType subpel_variance_; +}; + 
+template<typename SubpelVarianceFunctionType> +void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + unsigned int var1; + REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1)); + const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_, + log2height_, x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + +template<> +void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + sec_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + unsigned int var1; + REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1, sec_)); + const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_, + log2width_, log2height_, + x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + // ----------------------------------------------------------------------------- // VP8 test cases. @@ -103,6 +289,7 @@ namespace vp8 { typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest; TEST_P(VP8VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP8VarianceTest, Ref) { RefTest(); } TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); } const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c; @@ -112,11 +299,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; INSTANTIATE_TEST_CASE_P( C, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c))); #if HAVE_MMX const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; @@ -126,11 +313,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx; const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 @@ -141,11 +328,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; INSTANTIATE_TEST_CASE_P( SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - 
make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_wmt), + make_tuple(3, 3, variance8x8_wmt), + make_tuple(3, 4, variance8x16_wmt), + make_tuple(4, 3, variance16x8_wmt), + make_tuple(4, 4, variance16x16_wmt))); #endif #endif // CONFIG_VP8_ENCODER @@ -158,22 +345,127 @@ namespace vp9 { #if CONFIG_VP9_ENCODER typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest; +typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest; +typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest; TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP9VarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; +const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; +const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c; const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c; const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c; const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c; +const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c; +const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c; +const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c; +const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c; +const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; +const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(2, 3, variance4x8_c), + make_tuple(3, 2, variance8x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c), + make_tuple(4, 5, variance16x32_c), + make_tuple(5, 4, variance32x16_c), + make_tuple(5, 5, variance32x32_c), + make_tuple(5, 6, variance32x64_c), + make_tuple(6, 5, variance64x32_c), + make_tuple(6, 6, variance64x64_c))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_c = + vp9_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t subpel_variance4x8_c = + vp9_sub_pixel_variance4x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x4_c = + vp9_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t subpel_variance8x8_c = + vp9_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x16_c = + vp9_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x8_c = + vp9_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t subpel_variance16x16_c = + vp9_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x32_c = + vp9_sub_pixel_variance16x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x16_c = + vp9_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t subpel_variance32x32_c = + vp9_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x64_c = + vp9_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t subpel_variance64x32_c = + vp9_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t subpel_variance64x64_c = + vp9_sub_pixel_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, 
VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c), + make_tuple(2, 3, subpel_variance4x8_c), + make_tuple(3, 2, subpel_variance8x4_c), + make_tuple(3, 3, subpel_variance8x8_c), + make_tuple(3, 4, subpel_variance8x16_c), + make_tuple(4, 3, subpel_variance16x8_c), + make_tuple(4, 4, subpel_variance16x16_c), + make_tuple(4, 5, subpel_variance16x32_c), + make_tuple(5, 4, subpel_variance32x16_c), + make_tuple(5, 5, subpel_variance32x32_c), + make_tuple(5, 6, subpel_variance32x64_c), + make_tuple(6, 5, subpel_variance64x32_c), + make_tuple(6, 6, subpel_variance64x64_c))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c = + vp9_sub_pixel_avg_variance4x4_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c = + vp9_sub_pixel_avg_variance4x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c = + vp9_sub_pixel_avg_variance8x4_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c = + vp9_sub_pixel_avg_variance8x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c = + vp9_sub_pixel_avg_variance8x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c = + vp9_sub_pixel_avg_variance16x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c = + vp9_sub_pixel_avg_variance16x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c = + vp9_sub_pixel_avg_variance16x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c = + vp9_sub_pixel_avg_variance32x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c = + vp9_sub_pixel_avg_variance32x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c = + vp9_sub_pixel_avg_variance32x64_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c = + vp9_sub_pixel_avg_variance64x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c = + vp9_sub_pixel_avg_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c), + make_tuple(2, 3, subpel_avg_variance4x8_c), + make_tuple(3, 2, subpel_avg_variance8x4_c), + make_tuple(3, 3, subpel_avg_variance8x8_c), + make_tuple(3, 4, subpel_avg_variance8x16_c), + make_tuple(4, 3, subpel_avg_variance16x8_c), + make_tuple(4, 4, subpel_avg_variance16x16_c), + make_tuple(4, 5, subpel_avg_variance16x32_c), + make_tuple(5, 4, subpel_avg_variance32x16_c), + make_tuple(5, 5, subpel_avg_variance32x32_c), + make_tuple(5, 6, subpel_avg_variance32x64_c), + make_tuple(6, 5, subpel_avg_variance64x32_c), + make_tuple(6, 6, subpel_avg_variance64x64_c))); #if HAVE_MMX const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx; @@ -183,26 +475,212 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx; const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 -const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2; -const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2; -const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2; -const vp9_variance_fn_t variance16x8_wmt = 
vp9_variance16x8_sse2; -const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; +const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; +const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; +const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2; +const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2; +const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2; +const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2; +const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2; +const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2; +const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2; +const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; +const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_sse2), + make_tuple(2, 3, variance4x8_sse2), + make_tuple(3, 2, variance8x4_sse2), + make_tuple(3, 3, variance8x8_sse2), + make_tuple(3, 4, variance8x16_sse2), + make_tuple(4, 3, variance16x8_sse2), + make_tuple(4, 4, variance16x16_sse2), + make_tuple(4, 5, variance16x32_sse2), + make_tuple(5, 4, variance32x16_sse2), + make_tuple(5, 5, variance32x32_sse2), + make_tuple(5, 6, variance32x64_sse2), + make_tuple(6, 5, variance64x32_sse2), + make_tuple(6, 6, variance64x64_sse2))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_sse = + vp9_sub_pixel_variance4x4_sse; +const vp9_subpixvariance_fn_t subpel_variance4x8_sse = + vp9_sub_pixel_variance4x8_sse; +const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 = + vp9_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 = + vp9_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 = + vp9_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 = + vp9_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 = + vp9_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 = + vp9_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 = + vp9_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 = + vp9_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 = + vp9_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 = + vp9_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 = + vp9_sub_pixel_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse), + make_tuple(2, 3, subpel_variance4x8_sse), + make_tuple(3, 2, subpel_variance8x4_sse2), + make_tuple(3, 3, subpel_variance8x8_sse2), + make_tuple(3, 4, subpel_variance8x16_sse2), + make_tuple(4, 3, subpel_variance16x8_sse2), + make_tuple(4, 4, subpel_variance16x16_sse2), + make_tuple(4, 5, subpel_variance16x32_sse2), + make_tuple(5, 4, subpel_variance32x16_sse2), + make_tuple(5, 5, subpel_variance32x32_sse2), + make_tuple(5, 6, 
subpel_variance32x64_sse2), + make_tuple(6, 5, subpel_variance64x32_sse2), + make_tuple(6, 6, subpel_variance64x64_sse2))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse = + vp9_sub_pixel_avg_variance4x4_sse; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse = + vp9_sub_pixel_avg_variance4x8_sse; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 = + vp9_sub_pixel_avg_variance8x4_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 = + vp9_sub_pixel_avg_variance8x8_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 = + vp9_sub_pixel_avg_variance8x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 = + vp9_sub_pixel_avg_variance16x8_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 = + vp9_sub_pixel_avg_variance16x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 = + vp9_sub_pixel_avg_variance16x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 = + vp9_sub_pixel_avg_variance32x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 = + vp9_sub_pixel_avg_variance32x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 = + vp9_sub_pixel_avg_variance32x64_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 = + vp9_sub_pixel_avg_variance64x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 = + vp9_sub_pixel_avg_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse), + make_tuple(2, 3, subpel_avg_variance4x8_sse), + make_tuple(3, 2, subpel_avg_variance8x4_sse2), + make_tuple(3, 3, subpel_avg_variance8x8_sse2), + make_tuple(3, 4, subpel_avg_variance8x16_sse2), + make_tuple(4, 3, subpel_avg_variance16x8_sse2), + make_tuple(4, 4, subpel_avg_variance16x16_sse2), + make_tuple(4, 5, subpel_avg_variance16x32_sse2), + make_tuple(5, 4, subpel_avg_variance32x16_sse2), + make_tuple(5, 5, subpel_avg_variance32x32_sse2), + make_tuple(5, 6, subpel_avg_variance32x64_sse2), + make_tuple(6, 5, subpel_avg_variance64x32_sse2), + make_tuple(6, 6, subpel_avg_variance64x64_sse2))); +#endif + +#if HAVE_SSSE3 +const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 = + vp9_sub_pixel_variance4x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 = + vp9_sub_pixel_variance4x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 = + vp9_sub_pixel_variance8x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 = + vp9_sub_pixel_variance8x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 = + vp9_sub_pixel_variance8x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 = + vp9_sub_pixel_variance16x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 = + vp9_sub_pixel_variance16x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 = + vp9_sub_pixel_variance16x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 = + vp9_sub_pixel_variance32x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 = + vp9_sub_pixel_variance32x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 = + vp9_sub_pixel_variance32x64_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 = + vp9_sub_pixel_variance64x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 = + vp9_sub_pixel_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelVarianceTest, + 
::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3), + make_tuple(2, 3, subpel_variance4x8_ssse3), + make_tuple(3, 2, subpel_variance8x4_ssse3), + make_tuple(3, 3, subpel_variance8x8_ssse3), + make_tuple(3, 4, subpel_variance8x16_ssse3), + make_tuple(4, 3, subpel_variance16x8_ssse3), + make_tuple(4, 4, subpel_variance16x16_ssse3), + make_tuple(4, 5, subpel_variance16x32_ssse3), + make_tuple(5, 4, subpel_variance32x16_ssse3), + make_tuple(5, 5, subpel_variance32x32_ssse3), + make_tuple(5, 6, subpel_variance32x64_ssse3), + make_tuple(6, 5, subpel_variance64x32_ssse3), + make_tuple(6, 6, subpel_variance64x64_ssse3))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 = + vp9_sub_pixel_avg_variance4x4_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 = + vp9_sub_pixel_avg_variance4x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 = + vp9_sub_pixel_avg_variance8x4_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 = + vp9_sub_pixel_avg_variance8x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 = + vp9_sub_pixel_avg_variance8x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 = + vp9_sub_pixel_avg_variance16x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 = + vp9_sub_pixel_avg_variance16x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 = + vp9_sub_pixel_avg_variance16x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 = + vp9_sub_pixel_avg_variance32x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 = + vp9_sub_pixel_avg_variance32x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 = + vp9_sub_pixel_avg_variance32x64_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 = + vp9_sub_pixel_avg_variance64x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 = + vp9_sub_pixel_avg_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3), + make_tuple(2, 3, subpel_avg_variance4x8_ssse3), + make_tuple(3, 2, subpel_avg_variance8x4_ssse3), + make_tuple(3, 3, subpel_avg_variance8x8_ssse3), + make_tuple(3, 4, subpel_avg_variance8x16_ssse3), + make_tuple(4, 3, subpel_avg_variance16x8_ssse3), + make_tuple(4, 4, subpel_avg_variance16x16_ssse3), + make_tuple(4, 5, subpel_avg_variance16x32_ssse3), + make_tuple(5, 4, subpel_avg_variance32x16_ssse3), + make_tuple(5, 5, subpel_avg_variance32x32_ssse3), + make_tuple(5, 6, subpel_avg_variance32x64_ssse3), + make_tuple(6, 5, subpel_avg_variance64x32_ssse3), + make_tuple(6, 6, subpel_avg_variance64x64_ssse3))); #endif #endif // CONFIG_VP9_ENCODER diff --git a/libvpx/test/vp9_lossless_test.cc b/libvpx/test/vp9_lossless_test.cc new file mode 100644 index 0000000..441cc44 --- /dev/null +++ b/libvpx/test/vp9_lossless_test.cc @@ -0,0 +1,75 @@ +/* + Copyright (c) 2012 The WebM project authors. All Rights Reserved. + + Use of this source code is governed by a BSD-style license + that can be found in the LICENSE file in the root of the source + tree. An additional intellectual property rights grant can be found + in the file PATENTS. All contributing project authors may + be found in the AUTHORS file in the root of the source tree. 
+*/ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const int kMaxPsnr = 100; + +class LossLessTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { + protected: + LossLessTest() : EncoderTest(GET_PARAM(0)), + psnr_(kMaxPsnr), + nframes_(0), + encoding_mode_(GET_PARAM(1)) { + } + + virtual ~LossLessTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.psnr.psnr[0] < psnr_) + psnr_= pkt->data.psnr.psnr[0]; + } + + double GetMinPsnr() const { + return psnr_; + } + + private: + double psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; +}; + +TEST_P(LossLessTest, TestLossLessEncoding) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + // intentionally changed the dimension for better testing coverage + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284, + timebase.den, timebase.num, 0, 30); + + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} +VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES); +} // namespace diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc new file mode 100644 index 0000000..3e5fe8d --- /dev/null +++ b/libvpx/test/vp9_subtract_test.cc @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +extern "C" { +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_mem/vpx_mem.h" +} + +typedef void (*subtract_fn_t)(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +namespace vp9 { + +class VP9SubtractBlockTest : public ::testing::TestWithParam<subtract_fn_t> { + public: + virtual void TearDown() { + libvpx_test::ClearSystemState(); + } +}; + +using libvpx_test::ACMRandom; + +TEST_P(VP9SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // FIXME(rbultje) split in its own file + for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES; + bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) { + const int block_width = 4 << b_width_log2(bsize); + const int block_height = 4 << b_height_log2(bsize); + int16_t *diff = reinterpret_cast<int16_t *>( + vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2)); + uint8_t *pred = reinterpret_cast<uint8_t *>( + vpx_memalign(16, block_width * block_height * 2)); + uint8_t *src = reinterpret_cast<uint8_t *>( + vpx_memalign(16, block_width * block_height * 2)); + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width * 2; ++c) { + src[r * block_width * 2 + c] = rnd.Rand8(); + pred[r * block_width * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height, block_width, diff, block_width, + src, block_width, pred, block_width); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width + c], + (src[r * block_width + c] - + pred[r * block_width + c])) << "r = " << r + << ", c = " << c + << ", bs = " << bsize; + } + } + + GetParam()(block_height, block_width, diff, block_width * 2, + src, block_width * 2, pred, block_width * 2); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width * 2 + c], + (src[r * block_width * 2 + c] - + pred[r * block_width * 2 + c])) << "r = " << r + << ", c = " << c + << ", bs = " << bsize; + } + } + } + vpx_free(diff); + vpx_free(pred); + vpx_free(src); + } +} + +INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_sse2)); +#endif + +} // namespace vp9 diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h index c7919a9..9fc8545 100644 --- a/libvpx/test/webm_video_source.h +++ b/libvpx/test/webm_video_source.h @@ -99,7 +99,7 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. 
Filename: " << file_name_; nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, @@ -130,6 +130,7 @@ class WebMVideoSource : public CompressedVideoSource { } void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); if (chunk_ >= chunks_) { unsigned int track; diff --git a/libvpx/third_party/libyuv/source/scale.c b/libvpx/third_party/libyuv/source/scale.c index 72a817d..3c30b55 100644 --- a/libvpx/third_party/libyuv/source/scale.c +++ b/libvpx/third_party/libyuv/source/scale.c @@ -1370,12 +1370,12 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 cmp eax, 0 je xloop1 - cmp eax, 128 + cmp eax, 64 je xloop2 - shr eax, 1 mov ah,al neg al add al, 128 @@ -2132,11 +2132,11 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr, "mov 0x14(%esp),%edx \n" "mov 0x18(%esp),%ecx \n" "mov 0x1c(%esp),%eax \n" + "shr %eax \n" "cmp $0x0,%eax \n" "je 2f \n" - "cmp $0x80,%eax \n" + "cmp $0x40,%eax \n" "je 3f \n" - "shr %eax \n" "mov %al,%ah \n" "neg %al \n" "add $0x80,%al \n" @@ -2662,6 +2662,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { + source_y_fraction >>= 1; if (source_y_fraction == 0) { asm volatile ( "1:" @@ -2680,7 +2681,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, : "memory", "cc", "rax" ); return; - } else if (source_y_fraction == 128) { + } else if (source_y_fraction == 64) { asm volatile ( "1:" "movdqa (%1),%%xmm0 \n" @@ -2703,7 +2704,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, } else { asm volatile ( "mov %3,%%eax \n" - "shr %%eax \n" "mov %%al,%%ah \n" "neg %%al \n" "add $0x80,%%al \n" diff --git a/libvpx/vp8/common/alloccommon.c b/libvpx/vp8/common/alloccommon.c index 8af9e90..54afc13 100644 --- a/libvpx/vp8/common/alloccommon.c +++ b/libvpx/vp8/common/alloccommon.c @@ -173,7 +173,6 @@ void vp8_create_common(VP8_COMMON *oci) oci->use_bilinear_mc_filter = 0; oci->full_pixel = 0; oci->multi_token_partition = ONE_PARTITION; - oci->clr_type = REG_YUV; oci->clamp_type = RECON_CLAMP_REQUIRED; /* Initialize reference frame sign bias structure to defaults */ diff --git a/libvpx/vp8/common/onyxc_int.h b/libvpx/vp8/common/onyxc_int.h index 276dd72..e9bb7af 100644 --- a/libvpx/vp8/common/onyxc_int.h +++ b/libvpx/vp8/common/onyxc_int.h @@ -72,7 +72,6 @@ typedef struct VP8Common int horiz_scale; int vert_scale; - YUV_TYPE clr_type; CLAMP_TYPE clamp_type; YV12_BUFFER_CONFIG *frame_to_show; @@ -115,9 +114,6 @@ typedef struct VP8Common int uvdc_delta_q; int uvac_delta_q; - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. 
*/ @@ -157,7 +153,6 @@ typedef struct VP8Common unsigned int current_video_frame; - int near_boffset[3]; int version; TOKEN_PARTITION multi_token_partition; @@ -165,8 +160,10 @@ typedef struct VP8Common #ifdef PACKET_TESTING VP8_HEADER oh; #endif +#if CONFIG_POSTPROC_VISUALIZER double bitrate; double framerate; +#endif #if CONFIG_MULTITHREAD int processor_core_count; diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c index 0266f4c..dd998f1 100644 --- a/libvpx/vp8/common/postproc.c +++ b/libvpx/vp8/common/postproc.c @@ -923,7 +923,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t if (flags & VP8D_DEBUG_TXT_RATE_INFO) { char message[512]; - sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate); + sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate); vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); } diff --git a/libvpx/vp8/common/vp8_asm_com_offsets.c b/libvpx/vp8/common/vp8_asm_com_offsets.c deleted file mode 100644 index 7bab90f..0000000 --- a/libvpx/vp8/common/vp8_asm_com_offsets.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vpx/vpx_codec.h" -#include "vpx_ports/asm_offsets.h" -#include "vp8/common/blockd.h" - -#if CONFIG_POSTPROC -#include "postproc.h" -#endif /* CONFIG_POSTPROC */ - -BEGIN - -#if CONFIG_POSTPROC -/* mfqe.c / filter_by_weight */ -DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION); -#endif /* CONFIG_POSTPROC */ - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ - -#if HAVE_MEDIA -/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */ -ct_assert(B_DC_PRED, B_DC_PRED == 0); -ct_assert(B_TM_PRED, B_TM_PRED == 1); -ct_assert(B_VE_PRED, B_VE_PRED == 2); -ct_assert(B_HE_PRED, B_HE_PRED == 3); -ct_assert(B_LD_PRED, B_LD_PRED == 4); -ct_assert(B_RD_PRED, B_RD_PRED == 5); -ct_assert(B_VR_PRED, B_VR_PRED == 6); -ct_assert(B_VL_PRED, B_VL_PRED == 7); -ct_assert(B_HD_PRED, B_HD_PRED == 8); -ct_assert(B_HU_PRED, B_HU_PRED == 9); -#endif - -#if HAVE_SSE2 -#if CONFIG_POSTPROC -/* vp8_filter_by_weight16x16 and 8x8 */ -ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4) -#endif /* CONFIG_POSTPROC */ -#endif /* HAVE_SSE2 */ diff --git a/libvpx/vp8/decoder/dboolhuff.c b/libvpx/vp8/decoder/dboolhuff.c index 546fb2d..0007d7a 100644 --- a/libvpx/vp8/decoder/dboolhuff.c +++ b/libvpx/vp8/decoder/dboolhuff.c @@ -47,8 +47,8 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br) unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1]; if (br->decrypt_cb) { - int n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left; - br->decrypt_cb(br->decrypt_state, bufptr, decrypted, n); + size_t n = bytes_left > sizeof(decrypted) ? 
sizeof(decrypted) : bytes_left; + br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n); bufptr = decrypted; } diff --git a/libvpx/vp8/decoder/decodframe.c b/libvpx/vp8/decoder/decodframe.c index 44c35ef..51eeb02 100644 --- a/libvpx/vp8/decoder/decodframe.c +++ b/libvpx/vp8/decoder/decodframe.c @@ -1095,7 +1095,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder 0"); if (pc->frame_type == KEY_FRAME) { - pc->clr_type = (YUV_TYPE)vp8_read_bit(bc); + (void)vp8_read_bit(bc); // colorspace pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc); } diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c index 2db3096..2d9e343 100644 --- a/libvpx/vp8/decoder/onyxd_if.c +++ b/libvpx/vp8/decoder/onyxd_if.c @@ -430,7 +430,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; - sd->clrtype = pbi->common.clr_type; #if CONFIG_POSTPROC ret = vp8_post_proc_frame(&pbi->common, sd, flags); #else diff --git a/libvpx/vp8/decoder/vp8_asm_dec_offsets.c b/libvpx/vp8/decoder/vp8_asm_dec_offsets.c deleted file mode 100644 index 842a0d5..0000000 --- a/libvpx/vp8/decoder/vp8_asm_dec_offsets.c +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/asm_offsets.h" -#include "onyxd_int.h" - -BEGIN - -DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end)); -DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer)); -DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value)); -DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count)); -DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range)); - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c index 4707ae5..5f0c1f7 100644 --- a/libvpx/vp8/encoder/bitstream.c +++ b/libvpx/vp8/encoder/bitstream.c @@ -1322,7 +1322,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_start_encode(bc, cx_data, cx_data_end); /* signal clr type */ - vp8_write_bit(bc, pc->clr_type); + vp8_write_bit(bc, 0); vp8_write_bit(bc, pc->clamp_type); } diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c index 433726d..ded0c43 100644 --- a/libvpx/vp8/encoder/firstpass.c +++ b/libvpx/vp8/encoder/firstpass.c @@ -1325,7 +1325,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta return Q; } -extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate); +extern void vp8_new_framerate(VP8_COMP *cpi, double framerate); void vp8_init_second_pass(VP8_COMP *cpi) { @@ -1349,9 +1349,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_frame_rate = cpi->frame_rate; + cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ; cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0); @@ -2398,7 +2398,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) target_frame_size += cpi->min_frame_bandwidth; /* Every other frame gets a few extra bits */ - if ( (cpi->common.frames_since_golden & 0x01) && + if ( (cpi->frames_since_golden & 0x01) && (cpi->frames_till_gf_update_due > 0) ) { target_frame_size += cpi->twopass.alt_extra_bits; @@ -2529,7 +2529,7 @@ void vp8_second_pass(VP8_COMP *cpi) /* Set nominal per second bandwidth for this frame */ cpi->target_bandwidth = (int) - (cpi->per_frame_bandwidth * cpi->output_frame_rate); + (cpi->per_frame_bandwidth * cpi->output_framerate); if (cpi->target_bandwidth < 0) cpi->target_bandwidth = 0; @@ -3185,7 +3185,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) /* Convert to a per second bitrate */ cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } /* Note the total error score of the kf group minus the key frame itself */ @@ -3224,7 +3224,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) cpi->common.vert_scale = NORMAL; /* Calculate Average bits per frame. */ - av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate); + av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate); /* CBR... 
Use the clip average as the target for deciding resample */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) @@ -3299,7 +3299,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) } else { - int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate)); + int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate)); int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; /* If triggered last time the threshold for triggering again is diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c index 73f6583..7c07975 100644 --- a/libvpx/vp8/encoder/onyx_if.c +++ b/libvpx/vp8/encoder/onyx_if.c @@ -301,11 +301,11 @@ static int rescale(int val, int num, int denom) static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, const int layer, - double prev_layer_frame_rate) + double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; - lc->frame_rate = cpi->output_frame_rate / cpi->oxcf.rate_decimator[layer]; + lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level; @@ -335,7 +335,7 @@ static void init_temporal_layer_context(VP8_COMP *cpi, lc->avg_frame_size_for_layer = (int)((cpi->oxcf.target_bitrate[layer] - cpi->oxcf.target_bitrate[layer-1]) * 1000 / - (lc->frame_rate - prev_layer_frame_rate)); + (lc->framerate - prev_layer_framerate)); lc->active_worst_quality = cpi->oxcf.worst_allowed_q; lc->active_best_quality = cpi->oxcf.best_allowed_q; @@ -363,7 +363,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, const int prev_num_layers) { int i; - double prev_layer_frame_rate = 0; + double prev_layer_framerate = 0; const int curr_num_layers = cpi->oxcf.number_of_layers; // If the previous state was 1 layer, get current layer context from cpi. // We need this to set the layer context for the new layers below. @@ -377,7 +377,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, LAYER_CONTEXT *lc = &cpi->layer_context[i]; if (i >= prev_num_layers) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate); + init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); } // The initial buffer levels are set based on their starting levels. 
// We could set the buffer levels based on the previous state (normalized @@ -403,8 +403,8 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, lc->bits_off_target = lc->buffer_level; restore_layer_context(cpi, 0); } - prev_layer_frame_rate = cpi->output_frame_rate / - cpi->oxcf.rate_decimator[i]; + prev_layer_framerate = cpi->output_framerate / + cpi->oxcf.rate_decimator[i]; } } @@ -1282,21 +1282,21 @@ int vp8_reverse_trans(int x) return 63; } -void vp8_new_frame_rate(VP8_COMP *cpi, double framerate) +void vp8_new_framerate(VP8_COMP *cpi, double framerate) { if(framerate < .1) framerate = 30; - cpi->frame_rate = framerate; - cpi->output_frame_rate = framerate; + cpi->framerate = framerate; + cpi->output_framerate = framerate; cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / - cpi->output_frame_rate); + cpi->output_framerate); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); /* Set Maximum gf/arf interval */ - cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2); + cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2); if(cpi->max_gf_interval < 12) cpi->max_gf_interval = 12; @@ -1337,13 +1337,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) * seems like a reasonable framerate, then use that as a guess, otherwise * use 30. */ - cpi->frame_rate = (double)(oxcf->timebase.den) / - (double)(oxcf->timebase.num); + cpi->framerate = (double)(oxcf->timebase.den) / + (double)(oxcf->timebase.num); - if (cpi->frame_rate > 180) - cpi->frame_rate = 30; + if (cpi->framerate > 180) + cpi->framerate = 30; - cpi->ref_frame_rate = cpi->frame_rate; + cpi->ref_framerate = cpi->framerate; /* change includes all joint functionality */ vp8_change_config(cpi, oxcf); @@ -1369,13 +1369,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) if (cpi->oxcf.number_of_layers > 1) { unsigned int i; - double prev_layer_frame_rate=0; + double prev_layer_framerate=0; for (i=0; i<cpi->oxcf.number_of_layers; i++) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate); - prev_layer_frame_rate = cpi->output_frame_rate / - cpi->oxcf.rate_decimator[i]; + init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi->output_framerate / + cpi->oxcf.rate_decimator[i]; } } @@ -1399,14 +1399,14 @@ static void update_layer_contexts (VP8_COMP *cpi) if (oxcf->number_of_layers > 1) { unsigned int i; - double prev_layer_frame_rate=0; + double prev_layer_framerate=0; for (i=0; i<oxcf->number_of_layers; i++) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->frame_rate = - cpi->ref_frame_rate / oxcf->rate_decimator[i]; + lc->framerate = + cpi->ref_framerate / oxcf->rate_decimator[i]; lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; lc->starting_buffer_level = rescale( @@ -1432,9 +1432,9 @@ static void update_layer_contexts (VP8_COMP *cpi) lc->avg_frame_size_for_layer = (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i-1]) * 1000 / - (lc->frame_rate - prev_layer_frame_rate)); + (lc->framerate - prev_layer_framerate)); - prev_layer_frame_rate = lc->frame_rate; + prev_layer_framerate = lc->framerate; } } } @@ -1625,7 +1625,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) cpi->oxcf.target_bandwidth, 1000); /* Set up frame rate and related parameters rate control values. 
*/ - vp8_new_frame_rate(cpi, cpi->frame_rate); + vp8_new_framerate(cpi, cpi->framerate); /* Set absolute upper and lower quality limits */ cpi->worst_quality = cpi->oxcf.worst_allowed_q; @@ -1945,7 +1945,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) for (i = 0; i < KEY_FRAME_CONTEXT; i++) { - cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate; + cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate; } #ifdef OUTPUT_YUV_SRC @@ -2273,7 +2273,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { extern int count_mb_seg[4]; FILE *f = fopen("modes.stt", "a"); - double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; + double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ; fprintf(f, "intra_mode in Intra Frames:\n"); fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]); fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]); @@ -2750,7 +2750,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi) cpi->gf_active_count = cm->mb_rows * cm->mb_cols; /* this frame refreshes means next frames don't unless specified by user */ - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; /* Clear the alternate reference update pending flag. */ cpi->source_alt_ref_pending = 0; @@ -2802,7 +2802,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi) * user */ cm->refresh_golden_frame = 0; - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; cpi->recent_ref_frame_usage[LAST_FRAME] = 1; @@ -2834,12 +2834,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi) if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; - if (cpi->common.frames_till_alt_ref_frame) - cpi->common.frames_till_alt_ref_frame --; + if (cpi->frames_till_alt_ref_frame) + cpi->frames_till_alt_ref_frame --; - cpi->common.frames_since_golden ++; + cpi->frames_since_golden ++; - if (cpi->common.frames_since_golden > 1) + if (cpi->frames_since_golden > 1) { cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME]; @@ -2890,11 +2890,11 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) cpi->prob_last_coded = 200; cpi->prob_gf_coded = 1; } - else if (cpi->common.frames_since_golden == 0) + else if (cpi->frames_since_golden == 0) { cpi->prob_last_coded = 214; } - else if (cpi->common.frames_since_golden == 1) + else if (cpi->frames_since_golden == 1) { cpi->prob_last_coded = 192; cpi->prob_gf_coded = 220; @@ -3368,12 +3368,12 @@ static void encode_frame_to_data_rate cpi->per_frame_bandwidth = cpi->twopass.gf_bits; /* per second target bitrate */ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } } else #endif - cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate); + cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_framerate); /* Default turn off buffer to buffer copying */ cm->copy_buffer_to_gf = 0; @@ -4557,7 +4557,7 @@ static void encode_frame_to_data_rate { LAYER_CONTEXT *lc = &cpi->layer_context[i]; int bits_off_for_this_layer = - (int)(lc->target_bandwidth / lc->frame_rate - + (int)(lc->target_bandwidth / lc->framerate - cpi->projected_frame_size); lc->bits_off_target += bits_off_for_this_layer; @@ -4805,7 +4805,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, { double 
two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *cpi->oxcf.two_pass_vbrmin_section / 100); - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate); } } #endif @@ -4821,8 +4821,10 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C { #if HAVE_NEON int64_t store_reg[8]; -#endif +#if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON *cm = &cpi->common; +#endif +#endif struct vpx_usec_timer timer; int res = 0; @@ -4848,7 +4850,6 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) res = -1; - cm->clr_type = sd->clrtype; vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); @@ -4933,7 +4934,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l cpi->frames_till_gf_update_due); force_src_buffer = &cpi->alt_ref_buffer; } - cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; + cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; cm->refresh_alt_ref_frame = 1; cm->refresh_golden_frame = 0; cm->refresh_last_frame = 0; @@ -5038,7 +5039,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l if (this_duration) { if (step) - cpi->ref_frame_rate = 10000000.0 / this_duration; + cpi->ref_framerate = 10000000.0 / this_duration; else { double avg_duration, interval; @@ -5052,11 +5053,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l if(interval > 10000000.0) interval = 10000000; - avg_duration = 10000000.0 / cpi->ref_frame_rate; + avg_duration = 10000000.0 / cpi->ref_framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; - cpi->ref_frame_rate = 10000000.0 / avg_duration; + cpi->ref_framerate = 10000000.0 / avg_duration; } if (cpi->oxcf.number_of_layers > 1) @@ -5067,12 +5068,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l for (i=0; i<cpi->oxcf.number_of_layers; i++) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->frame_rate = cpi->ref_frame_rate / - cpi->oxcf.rate_decimator[i]; + lc->framerate = cpi->ref_framerate / + cpi->oxcf.rate_decimator[i]; } } else - vp8_new_frame_rate(cpi, cpi->ref_frame_rate); + vp8_new_framerate(cpi, cpi->ref_framerate); } cpi->last_time_stamp_seen = cpi->source->ts_start; @@ -5089,7 +5090,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l layer = cpi->oxcf.layer_id[ cpi->temporal_pattern_counter % cpi->oxcf.periodicity]; restore_layer_context (cpi, layer); - vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate); + vp8_new_framerate(cpi, cpi->layer_context[layer].framerate); } if (cpi->compressor_speed == 2) diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h index 5120fcc..3ab0fe8 100644 --- a/libvpx/vp8/encoder/onyx_int.h +++ b/libvpx/vp8/encoder/onyx_int.h @@ -232,7 +232,7 @@ enum typedef struct { /* Layer configuration */ - double frame_rate; + double framerate; int target_bandwidth; /* Layer specific coding parameters */ @@ -320,6 +320,7 @@ typedef struct VP8_COMP YV12_BUFFER_CONFIG scaled_source; YV12_BUFFER_CONFIG *last_frame_unscaled_source; + unsigned int frames_till_alt_ref_frame; /* frame in src_buffers has been identified to be encoded as an alt ref */ int source_alt_ref_pending; /* an alt ref 
frame has been encoded and is usable */ @@ -369,6 +370,7 @@ typedef struct VP8_COMP double key_frame_rate_correction_factor; double gf_rate_correction_factor; + unsigned int frames_since_golden; /* Count down till next GF */ int frames_till_gf_update_due; @@ -401,7 +403,7 @@ typedef struct VP8_COMP /* Minimum allocation that should be used for any frame */ int min_frame_bandwidth; int inter_frame_target; - double output_frame_rate; + double output_framerate; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; @@ -415,8 +417,8 @@ typedef struct VP8_COMP int buffered_mode; - double frame_rate; - double ref_frame_rate; + double framerate; + double ref_framerate; int64_t buffer_level; int64_t bits_off_target; diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c index 8e3c01d..1e8259c 100644 --- a/libvpx/vp8/encoder/ratectrl.c +++ b/libvpx/vp8/encoder/ratectrl.c @@ -234,7 +234,7 @@ void vp8_save_coding_context(VP8_COMP *cpi) cc->frames_since_key = cpi->frames_since_key; cc->filter_level = cpi->common.filter_level; cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due; - cc->frames_since_golden = cpi->common.frames_since_golden; + cc->frames_since_golden = cpi->frames_since_golden; vp8_copy(cc->mvc, cpi->common.fc.mvc); vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts); @@ -271,7 +271,7 @@ void vp8_restore_coding_context(VP8_COMP *cpi) cpi->frames_since_key = cc->frames_since_key; cpi->common.filter_level = cc->filter_level; cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due; - cpi->common.frames_since_golden = cc->frames_since_golden; + cpi->frames_since_golden = cc->frames_since_golden; vp8_copy(cpi->common.fc.mvc, cc->mvc); @@ -388,7 +388,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */ /* Boost depends somewhat on frame rate: only used for 1 layer case. */ if (cpi->oxcf.number_of_layers == 1) { - kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16)); + kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); } else { /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */ @@ -399,9 +399,9 @@ static void calc_iframe_target_size(VP8_COMP *cpi) kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100; /* frame separation adjustment ( down) */ - if (cpi->frames_since_key < cpi->output_frame_rate / 2) + if (cpi->frames_since_key < cpi->output_framerate / 2) kf_boost = (int)(kf_boost - * cpi->frames_since_key / (cpi->output_frame_rate / 2)); + * cpi->frames_since_key / (cpi->output_framerate / 2)); /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) @@ -715,7 +715,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) if (Adjustment > (cpi->this_frame_target - min_frame_target)) Adjustment = (cpi->this_frame_target - min_frame_target); - if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1)) + if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1)) cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment); else cpi->this_frame_target -= Adjustment; @@ -1360,7 +1360,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi) * whichever is smaller. */ int key_freq = cpi->oxcf.key_freq>0 ? 
cpi->oxcf.key_freq : 1; - av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2; + av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2; if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) av_key_frame_frequency = key_freq; diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c index 8579614..521e84f 100644 --- a/libvpx/vp8/encoder/rdopt.c +++ b/libvpx/vp8/encoder/rdopt.c @@ -341,7 +341,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue) void vp8_auto_select_speed(VP8_COMP *cpi) { - int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate); + int milliseconds_for_compress = (int)(1000000 / cpi->framerate); milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16; diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk index cde2651..f98eb31 100644 --- a/libvpx/vp8/vp8_common.mk +++ b/libvpx/vp8/vp8_common.mk @@ -66,7 +66,6 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c VP8_COMMON_SRCS-yes += common/variance_c.c VP8_COMMON_SRCS-yes += common/variance.h -VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h @@ -192,7 +191,4 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(A VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) -$(eval $(call asm_offsets_template,\ - vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c)) - $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh)) diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index 4531d5a..9a7b9c5 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -695,7 +695,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->uv_stride = img->stride[VPX_PLANE_U]; yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; - yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); return res; } @@ -1079,11 +1078,7 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer; ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer; - if (sd.clrtype == REG_YUV) - ctx->preview_img.fmt = VPX_IMG_FMT_I420; - else - ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420; - + ctx->preview_img.fmt = VPX_IMG_FMT_I420; ctx->preview_img.x_chroma_shift = 1; ctx->preview_img.y_chroma_shift = 1; diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c index c826f69..871b8d3 100644 --- a/libvpx/vp8/vp8_dx_iface.c +++ b/libvpx/vp8/vp8_dx_iface.c @@ -41,15 +41,6 @@ typedef enum static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t); -typedef struct -{ - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - static const mem_req_t vp8_mem_req_segs[] = { {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz}, @@ -93,65 +84,6 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_ return sizeof(vpx_codec_alg_priv_t); } - -static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) -{ - free(mmap->priv); -} - -static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) -{ - vpx_codec_err_t res; - unsigned int align; - - align = mmap->align ? 
mmap->align - 1 : 0; - - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); - - res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vp8_mmap_dtor; - return res; -} - -static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - vpx_codec_flags_t init_flags) -{ - int i; - vpx_codec_err_t res = VPX_CODEC_OK; - - for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) - { - /* Ensure the segment has been allocated */ - if (!mmaps[i].base) - { - res = VPX_CODEC_MEM_ERROR; - break; - } - - /* Verify variable size segment is big enough for the current si. */ - if (vp8_mem_req_segs[i].calc_sz) - { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) - { - res = VPX_CODEC_MEM_ERROR; - break; - } - } - } - - return res; -} - static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { int i; @@ -178,16 +110,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) } } -static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) -{ - int i; - - for (i = 0; i < NELEMENTS(ctx->mmaps); i++) - if (ctx->mmaps[i].id == id) - return ctx->mmaps[i].base; - - return NULL; -} static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { /* nothing to clean up */ @@ -214,7 +136,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, mmap.align = vp8_mem_req_segs[0].align; mmap.flags = vp8_mem_req_segs[0].flags; - res = vp8_mmap_alloc(&mmap); + res = vpx_mmap_alloc(&mmap); if (res != VPX_CODEC_OK) return res; vp8_init_ctx(ctx, &mmap); @@ -366,8 +288,7 @@ static void yuvconfig2image(vpx_image_t *img, * the Y, U, and V planes, nor other alignment adjustments that * might be representable by a YV12_BUFFER_CONFIG, so we just * initialize all the fields.*/ - img->fmt = yv12->clrtype == REG_YUV ? 
- VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420; + img->fmt = VPX_IMG_FMT_I420; img->w = yv12->y_stride; img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; img->d_w = yv12->y_width; @@ -488,7 +409,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg, ctx->base.init_flags); - res = vp8_mmap_alloc(&ctx->mmaps[i]); + res = vpx_mmap_alloc(&ctx->mmaps[i]); } if (!res) @@ -500,7 +421,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { - res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags); + res = vpx_validate_mmaps(&ctx->si, ctx->mmaps, + vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs), + ctx->base.init_flags); if (!res) { @@ -797,8 +720,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->uv_stride = img->stride[VPX_PLANE_U]; yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2; - yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); - return res; } diff --git a/libvpx/vp8/vp8dx.mk b/libvpx/vp8/vp8dx.mk index c26f42d..4a8f467 100644 --- a/libvpx/vp8/vp8dx.mk +++ b/libvpx/vp8/vp8dx.mk @@ -35,9 +35,5 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h VP8_DX_SRCS-yes += decoder/treereader.h VP8_DX_SRCS-yes += decoder/onyxd_if.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c -VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) - -$(eval $(call asm_offsets_template,\ - vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c)) diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm new file mode 100644 index 0000000..15039e2 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -0,0 +1,277 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; VP9_FILTER_WEIGHT == 128 + ; VP9_FILTER_SHIFT == 7 + + EXPORT |vp9_convolve8_avg_horiz_neon| + EXPORT |vp9_convolve8_avg_vert_neon| + IMPORT |vp9_convolve8_avg_horiz_c| + IMPORT |vp9_convolve8_avg_vert_c| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|vp9_convolve8_avg_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r4, [sp, #36] ; x_step_q4 + ldr r5, [sp, #32] ; filter_x + cmp r4, #16 + bne call_horiz_c_convolve ; x_step_q4 != 16 + + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + add r8, r1, r1, lsl #1 ; src_stride * 3 + add r8, r8, #4 ; src_stride * 3 + 4 + rsb r8, r8, #0 ; reset for src + + add r4, r3, r3, lsl #1 ; dst_stride * 3 + sub r4, r4, #4 ; dst_stride * 3 - 4 + rsb r4, r4, #0 ; reset for dst + + sub r9, r1, #8 ; post increment for src load + + rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +loop_horiz + vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! + vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 + + vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! + vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 + + vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! + vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 + + vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! + vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! 
+ vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + + ; extract to s16 + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + vtrn.32 d28, d29 ; only the first half is populated + vmovl.u8 q12, d28 + vmovl.u8 q13, d30 + + ; slightly out of order load to match the existing data + vld1.u32 {d6[0]}, [r2], r3 + vld1.u32 {d7[0]}, [r2], r3 + vld1.u32 {d6[1]}, [r2], r3 + vld1.u32 {d7[1]}, [r2], r3 + + sub r2, r2, r3, lsl #2 ; reset for store + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 + MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 + MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + ; average the new value and the dst value + vaddl.u8 q8, d2, d6 + vaddl.u8 q9, d3, d7 + vqrshrn.u16 d2, q8, #1 + vqrshrn.u16 d3, q9, #1 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r4 + + subs r6, r6, #4 ; w -= 4 + bgt loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r1 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt loop_horiz + + pop {r4-r10, pc} + +call_horiz_c_convolve + pop {r4-r10, lr} + add r0, r0, #3 ; un-adjust for taps + b vp9_convolve8_avg_horiz_c + + + ENDP + +|vp9_convolve8_avg_vert_neon| PROC + push {r4-r10, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r6, [sp, #44] ; y_step_q4 + ldr r7, [sp, #40] ; filter_y + cmp r6, #16 + bne call_vert_c_convolve ; y_step_q4 != 16 + + ldr r8, [sp, #48] ; w + ldr r9, [sp, #52] ; h + + vld1.s16 {q0}, [r7] ; filter_y + + mov r5, r1, lsl #1 ; src_stride * 2 + add r5, r5, r1, lsl #3 ; src_stride * 10 + sub r5, r5, #4 ; src_stride * 10 + 4 + rsb r5, r5, #0 ; reset for src + + add r6, r3, r3, lsl #1 ; dst_stride * 3 + sub r6, r6, #4 ; dst_stride * 3 - 4 + rsb r6, r6, #0 ; reset for dst + + rsb r7, r8, r1, lsl #2 ; reset src for outer loop + rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + + mov r10, r8 ; w loop counter + +loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d16[0]}, [r0], r1 + vld1.u32 {d16[1]}, [r0], r1 + vld1.u32 {d18[0]}, [r0], r1 + vld1.u32 {d18[1]}, [r0], r1 + vld1.u32 {d20[0]}, [r0], r1 + vld1.u32 {d20[1]}, [r0], r1 + vld1.u32 {d22[0]}, [r0], r1 + vld1.u32 {d22[1]}, [r0], r1 + vld1.u32 {d24[0]}, [r0], r1 + vld1.u32 {d24[1]}, [r0], r1 + vld1.u32 {d26[0]}, [r0], r5 + + ; extract to s16 + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + vld1.u32 {d6[0]}, [r2], r3 + vld1.u32 {d6[1]}, [r2], r3 + vld1.u32 {d7[0]}, [r2], r3 + vld1.u32 {d7[1]}, [r2], r3 + + sub r2, r2, r3, lsl #2 ; reset for store + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + ; average the new value and the dst value + vaddl.u8 q8, d2, 
d6 + vaddl.u8 q9, d3, d7 + vqrshrn.u16 d2, q8, #1 + vqrshrn.u16 d3, q9, #1 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r6 + + subs r8, r8, #4 ; w -= 4 + bgt loop_vert + + ; outer loop + mov r8, r10 ; restore w counter + add r0, r0, r7 ; src += 4 * src_stride - w + add r2, r2, r12 ; dst += 4 * dst_stride - w + subs r9, r9, #4 ; h -= 4 + bgt loop_vert + + pop {r4-r10, pc} + +call_vert_c_convolve + pop {r4-r10, lr} + ; un-adjust for taps + add r0, r0, r1 + add r0, r0, r1, lsl #1 + b vp9_convolve8_avg_vert_c + + ENDP + END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm new file mode 100644 index 0000000..842c73c --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -0,0 +1,250 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; VP9_FILTER_WEIGHT == 128 + ; VP9_FILTER_SHIFT == 7 + + EXPORT |vp9_convolve8_horiz_neon| + EXPORT |vp9_convolve8_vert_neon| + IMPORT |vp9_convolve8_horiz_c| + IMPORT |vp9_convolve8_vert_c| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|vp9_convolve8_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r4, [sp, #36] ; x_step_q4 + ldr r5, [sp, #32] ; filter_x + cmp r4, #16 + bne call_horiz_c_convolve ; x_step_q4 != 16 + + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + add r8, r1, r1, lsl #1 ; src_stride * 3 + add r8, r8, #4 ; src_stride * 3 + 4 + rsb r8, r8, #0 ; reset for src + + add r4, r3, r3, lsl #1 ; dst_stride * 3 + sub r4, r4, #4 ; dst_stride * 3 - 4 + rsb r4, r4, #0 ; reset for dst + + sub r9, r1, #8 ; post increment for src load + + rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +loop_horiz + vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! + vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 + + vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! + vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 + + vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! + vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 + + vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! + vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! 
+ vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + + ; extract to s16 + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + vtrn.32 d28, d29 ; only the first half is populated + vmovl.u8 q12, d28 + vmovl.u8 q13, d30 + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 + MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 + MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r4 + + subs r6, r6, #4 ; w -= 4 + bgt loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r1 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt loop_horiz + + pop {r4-r10, pc} + +call_horiz_c_convolve + pop {r4-r10, lr} + add r0, r0, #3 ; un-adjust for taps + b vp9_convolve8_horiz_c + + + ENDP + +|vp9_convolve8_vert_neon| PROC + push {r4-r10, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r6, [sp, #44] ; y_step_q4 + ldr r7, [sp, #40] ; filter_y + cmp r6, #16 + bne call_vert_c_convolve ; y_step_q4 != 16 + + ldr r8, [sp, #48] ; w + ldr r9, [sp, #52] ; h + + vld1.s16 {q0}, [r7] ; filter_y + + mov r5, r1, lsl #1 ; src_stride * 2 + add r5, r5, r1, lsl #3 ; src_stride * 10 + sub r5, r5, #4 ; src_stride * 10 + 4 + rsb r5, r5, #0 ; reset for src + + add r6, r3, r3, lsl #1 ; dst_stride * 3 + sub r6, r6, #4 ; dst_stride * 3 - 4 + rsb r6, r6, #0 ; reset for dst + + rsb r7, r8, r1, lsl #2 ; reset src for outer loop + rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + + mov r10, r8 ; w loop counter + +loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d16[0]}, [r0], r1 + vld1.u32 {d16[1]}, [r0], r1 + vld1.u32 {d18[0]}, [r0], r1 + vld1.u32 {d18[1]}, [r0], r1 + vld1.u32 {d20[0]}, [r0], r1 + vld1.u32 {d20[1]}, [r0], r1 + vld1.u32 {d22[0]}, [r0], r1 + vld1.u32 {d22[1]}, [r0], r1 + vld1.u32 {d24[0]}, [r0], r1 + vld1.u32 {d24[1]}, [r0], r1 + vld1.u32 {d26[0]}, [r0], r5 + + ; extract to s16 + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r6 + + subs r8, r8, #4 ; w -= 4 + bgt loop_vert + + ; outer loop + mov r8, r10 ; restore w counter + add r0, r0, r7 ; src += 4 * src_stride - w + add r2, r2, r12 ; dst += 4 * dst_stride - w + subs r9, r9, #4 ; h -= 4 + bgt loop_vert + + pop {r4-r10, pc} + +call_vert_c_convolve + pop {r4-r10, lr} + ; un-adjust for taps + add r0, r0, r1 + add r0, r0, r1, lsl #1 + b vp9_convolve8_vert_c + + ENDP + END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c 
b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c new file mode 100644 index 0000000..6e37ff6 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). + */ + uint8_t temp[64 * 72]; + + // Account for the vertical phase needing 3 lines prior and 4 lines post + int intermediate_height = h + 7; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + /* Filter starting 3 lines back. The neon implementation will ignore the + * given height and filter a multiple of 4 lines. Since this goes in to + * the temp buffer which has lots of extra room and is subsequently discarded + * this is safe if somewhat less than ideal. + */ + vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vp9_convolve8_vert_neon(temp + 64 * 3, 64, + dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} + +void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + uint8_t temp[64 * 72]; + int intermediate_height = h + 7; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_avg_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height); + vp9_convolve8_avg_vert_neon(temp + 64 * 3, + 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} diff --git a/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm new file mode 100644 index 0000000..60a0d98 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
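For reference, a minimal sketch of the temp-buffer sizing that the vp9_convolve_neon.c wrapper above relies on, assuming its stated constraints (w <= 64, h <= 64, 8 taps); the helper below is illustrative and not part of the library:

#include <assert.h>

#define TEMP_STRIDE 64   /* intermediate rows are stored with a stride of 64 */
#define TEMP_ROWS   72   /* 64 + 7 needed rows, rounded up to a multiple of 4 */

/* An 8-tap vertical pass over h output rows reads 3 rows above and 4 rows
 * below them, so the horizontal pass must emit h + 7 intermediate rows. */
static int intermediate_height(int h) {
  return h + 7;
}

int main(void) {
  /* worst case the wrapper permits: w = 64, h = 64 -> 71 rows <= 72 */
  assert(intermediate_height(64) <= TEMP_ROWS);
  return 0;
}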
+; + + + EXPORT |vp9_dc_only_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr, +; uint8_t *dst_ptr, int pitch, int stride) +; +; r0 int input_dc +; r1 uint8_t *pred_ptr +; r2 uint8_t *dst_ptr +; r3 int pitch +; sp int stride + +|vp9_dc_only_idct_add_neon| PROC + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; dct_const_round_shift(input_dc * cospi_16_64) + mul r0, r0, r12 ; input_dc * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.16 q0, r0; ; duplicate a1 + ldr r12, [sp] ; load stride + + vld1.32 {d2[0]}, [r1], r3 + vld1.32 {d2[1]}, [r1], r3 + vld1.32 {d4[0]}, [r1], r3 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c] + vaddw.u8 q2, q0, d4 + + vqmovun.s16 d2, q1 ; clip_pixel + vqmovun.s16 d4, q2 + + vst1.32 {d2[0]}, [r2], r12 + vst1.32 {d2[1]}, [r2], r12 + vst1.32 {d4[0]}, [r2], r12 + vst1.32 {d4[1]}, [r2] + + bx lr + ENDP ; |vp9_dc_only_idct_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm new file mode 100644 index 0000000..8b4fe5d --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -0,0 +1,708 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_loop_filter_horizontal_edge_neon| + EXPORT |vp9_loop_filter_vertical_edge_neon| + EXPORT |vp9_mbloop_filter_horizontal_edge_neon| + EXPORT |vp9_mbloop_filter_vertical_edge_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. 
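The vp9_dc_only_idct_add_neon routine above mirrors the scalar DC-only path comment for comment. A plain-C sketch of that arithmetic, using the constants visible in the code (cospi_16_64 = 11585, DCT_CONST_BITS = 14); the function and clip_pixel names here are illustrative, not the library's:

#include <stdint.h>

#define COSPI_16_64    11585
#define DCT_CONST_BITS 14

/* Saturate to [0, 255]; stands in for the codec's pixel clamp. */
static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* The DC coefficient goes through dct_const_round_shift() twice (once for the
 * row pass, once for the column pass), is rounded by ROUND_POWER_OF_TWO(out, 4)
 * and then added to every pixel of the 4x4 prediction block. */
static void dc_only_idct_add_sketch(int input_dc, const uint8_t *pred,
                                    uint8_t *dst, int pitch, int stride) {
  int out = (input_dc * COSPI_16_64 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  out = (out * COSPI_16_64 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  {
    const int a1 = (out + 8) >> 4;  /* ROUND_POWER_OF_TWO(out, 4) */
    int r, c;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        dst[r * stride + c] = clip_pixel(pred[r * pitch + c] + a1);
  }
}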
+; +; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_loop_filter_horizontal_edge_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + cmp r12, #0 + beq end_vp9_lf_h_edge + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + +count_lf_h_loop + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl vp9_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + add r0, r0, #8 + subs r12, r12, #1 + bne count_lf_h_loop + +end_vp9_lf_h_edge + pop {pc} + ENDP ; |vp9_loop_filter_horizontal_edge_neon| + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. +; +; void vp9_loop_filter_vertical_edge_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_loop_filter_vertical_edge_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + cmp r12, #0 + beq end_vp9_lf_v_edge + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + +count_lf_v_loop + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl vp9_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + add r0, r0, r1, lsl #3 ; s += pitch * 8 + subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns + bne count_lf_v_loop + +end_vp9_lf_v_edge + pop {pc} + ENDP ; |vp9_loop_filter_vertical_edge_neon| + +; void vp9_loop_filter_neon(); +; This is a helper function for the loopfilters. 
The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|vp9_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon| + +; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_mbloop_filter_horizontal_edge_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #16] ; load count + ldr r2, [sp, #12] ; load thresh + add r1, r1, r1 ; double pitch + + cmp r12, #0 + beq end_vp9_mblf_h_edge + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + +count_mblf_h_loop + sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r2, r3, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r3@64], r1 ; p3 + vld1.u8 {d4}, [r2@64], r1 ; p2 + vld1.u8 {d5}, [r3@64], r1 ; p1 + vld1.u8 {d6}, [r2@64], r1 ; p0 + vld1.u8 {d7}, 
[r3@64], r1 ; q0 + vld1.u8 {d16}, [r2@64], r1 ; q1 + vld1.u8 {d17}, [r3@64] ; q2 + vld1.u8 {d18}, [r2@64], r1 ; q3 + + sub r3, r3, r1, lsl #1 + sub r2, r2, r1, lsl #2 + + bl vp9_mbloop_filter_neon + + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 + + add r0, r0, #8 + subs r12, r12, #1 + bne count_mblf_h_loop + +end_vp9_mblf_h_edge + pop {r4-r5, pc} + + ENDP ; |vp9_mbloop_filter_horizontal_edge_neon| + +; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_mbloop_filter_vertical_edge_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #16] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + cmp r12, #0 + beq end_vp9_mblf_v_edge + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + +count_mblf_v_loop + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + sub r2, r0, #3 + add r3, r0, #1 + + bl vp9_mbloop_filter_neon + + ;store op2, op1, op0, oq0 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] + + ;store oq1, oq2 + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] + + add r0, r0, r1, lsl #3 ; s += pitch * 8 + subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns + bne count_mblf_v_loop + +end_vp9_mblf_v_edge + pop {r4-r5, pc} + ENDP ; |vp9_mbloop_filter_vertical_edge_neon| + +; void vp9_mbloop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. 
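Both loop-filter helpers apply the same filter() step their comments walk through: a clamped delta built from (p1 - q1) and 3 * (q0 - p0), inner-tap adjustments on p0/q0, and hev-gated outer-tap adjustments on p1/q1. A hedged scalar sketch of that step (names are illustrative; mask and hev are the 0/-1 byte masks computed beforehand):

#include <stdint.h>

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter4_sketch(int8_t mask, int8_t hev,
                           uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1) {
  /* bias pixels to the signed range, as the veor with 0x80 does */
  const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
  int8_t filter, filter1, filter2;

  filter = (int8_t)(signed_char_clamp(ps1 - qs1) & hev);
  filter = (int8_t)(signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask);

  filter1 = (int8_t)(signed_char_clamp(filter + 4) >> 3);
  filter2 = (int8_t)(signed_char_clamp(filter + 3) >> 3);

  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);

  /* outer taps use (filter1 + 1) >> 1, and only where hev is not set */
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);
  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
}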
+; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 +|vp9_mbloop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d1, d19 + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) + + vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + + vqadd.u8 d24, d24, d23 ; a = b + a + + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + + vmov.u8 d23, #1 + vcge.u8 d24, d0, d24 ; a > blimit + + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + + vcge.u8 d20, d23, d20 ; flat + + vand d19, d19, d24 ; mask + + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + + vand d20, d20, d19 ; flat & mask + + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + + ; This instruction will truncate the "flat & mask" masks down to 4 bits + ; each to fit into one 32 bit arm register. The values are stored in + ; q10.64[0]. + vshrn.u16 d30, q10, #4 + vmov.u32 r4, d30[0] ; flat & mask 4bits + + adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. + beq power_branch_only + + cmp r4, #0 ; Check for 0, set flag for later + + ; mbfilter() function + ; filter() function + ; convert to signed + veor d21, d7, d22 ; qs0 + veor d24, d6, d22 ; ps0 + veor d25, d5, d22 ; ps1 + veor d26, d16, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d23 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d23 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. + beq filter_branch_only + + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. 
+ veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. + vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + + vqrshrn.u16 d30, q14, #3 ; r_op2 + + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + + vqrshrn.u16 d31, q14, #3 ; r_op1 + + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) + + vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + + vqrshrn.u16 d22, q14, #3 ; r_oq0 + + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 + + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 + + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + + vqrshrn.u16 d6, q14, #3 ; r_oq1 + + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 + + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) + + vqrshrn.u16 d7, q14, #3 ; r_oq2 + + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) + + bx lr + +power_branch_only + vmov.u8 d27, #3 + vmov.u8 d21, #2 + vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 + vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 + vaddw.u8 q14, d5 ; op2 += p1 + vqrshrn.u16 d0, q14, #3 ; op2 + + vsubw.u8 q14, d3 ; op1 = op2 - p3 + vsubw.u8 q14, d4 ; op1 -= p2 + vaddw.u8 q14, d5 ; op1 += p1 + vaddw.u8 q14, d16 ; op1 += q1 + vqrshrn.u16 d1, q14, #3 ; op1 + + vsubw.u8 q14, d3 ; op0 = op1 - p3 + vsubw.u8 q14, d5 ; op0 -= p1 + vaddw.u8 q14, d6 ; op0 += p0 + vaddw.u8 q14, d17 ; op0 += q2 + vqrshrn.u16 d2, q14, #3 ; op0 + + vsubw.u8 q14, d3 ; oq0 = op0 - p3 + vsubw.u8 q14, d6 ; oq0 -= p0 + vaddw.u8 q14, d7 ; oq0 += q0 + vaddw.u8 q14, d18 ; oq0 += q3 + vqrshrn.u16 d3, q14, #3 ; oq0 + + vsubw.u8 q14, d4 ; oq1 = oq0 - p2 + vsubw.u8 q14, d7 ; oq1 -= q0 + vaddw.u8 q14, d16 ; oq1 += q1 + vaddw.u8 q14, d18 ; oq1 += q3 + vqrshrn.u16 d4, q14, #3 ; oq1 + + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 + vsubw.u8 q14, d16 ; oq2 -= q1 + vaddw.u8 q14, d17 ; oq2 += q2 + vaddw.u8 q14, d18 ; oq2 += q3 + vqrshrn.u16 d5, q14, #3 ; oq2 + + bx lr + +filter_branch_only + ; TODO(fgalligan): See if we can rearange registers so we do not need to + ; do the 2 vswp. 
+ vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 + + bx lr + + ENDP ; |vp9_mbloop_filter_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm new file mode 100644 index 0000000..8e4aada --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -0,0 +1,356 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_idct8x8_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are + ; loaded in q8-q15. The output will be stored back into q8-q15 registers. + ; This macro will touch q0-q7 registers and use them as buffer during + ; calculation. + MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3; ; duplicate cospi_28_64 + vdup.16 d1, r4; ; duplicate cospi_4_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[7] * cospi_4_64 + vmull.s16 q4, d30, d1 + vmull.s16 q5, d31, d1 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vsub.s32 q6, q2, q4 + vsub.s32 q7, q3, q5 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q6, #14 ; >> 14 + vqrshrn.s32 d9, q7, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[7] * cospi_28_64 + vmull.s16 q1, d30, d0 + vmull.s16 q5, d31, d0 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vadd.s32 q2, q2, q1 + vadd.s32 q3, q3, q5 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + vdup.16 d0, r5; ; duplicate cospi_12_64 + vdup.16 d1, r6; ; duplicate cospi_20_64 + + ; input[5] * cospi_12_64 + vmull.s16 q2, d26, d0 + vmull.s16 q3, d27, d0 + + ; input[3] * cospi_20_64 + vmull.s16 q5, d22, d1 + vmull.s16 q6, d23, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vsub.s32 q2, q2, q5 + vsub.s32 q3, q3, q6 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; input[5] * cospi_20_64 + vmull.s16 q2, d26, d1 + vmull.s16 q3, d27, d1 + + ; input[3] * cospi_12_64 + vmull.s16 q9, d22, d0 + vmull.s16 q15, d23, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vadd.s32 q0, q2, q9 + vadd.s32 q1, q3, q15 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q0, #14 ; >> 14 + vqrshrn.s32 d13, q1, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7; ; duplicate cospi_16_64 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[2] * cospi_16_64 + vmull.s16 q9, d24, d0 + vmull.s16 q11, d25, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vadd.s32 q9, q2, q9 + vadd.s32 q11, q3, q11 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q9, #14 ; >> 14 + vqrshrn.s32 d19, q11, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[2] * cospi_16_64 + vmull.s16 q13, d24, d0 + vmull.s16 q15, d25, d0 + 
+ ; (input[0] - input[2]) * cospi_16_64 + vsub.s32 q2, q2, q13 + vsub.s32 q3, q3, q15 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q2, #14 ; >> 14 + vqrshrn.s32 d23, q3, #14 ; >> 14 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vdup.16 d0, r8; ; duplicate cospi_24_64 + vdup.16 d1, r9; ; duplicate cospi_8_64 + + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[3] * cospi_8_64 + vmull.s16 q13, d28, d1 + vmull.s16 q15, d29, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vsub.s32 q2, q2, q13 + vsub.s32 q3, q3, q15 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; input[1] * cospi_8_64 + vmull.s16 q2, d20, d1 + vmull.s16 q3, d21, d1 + + ; input[3] * cospi_24_64 + vmull.s16 q8, d28, d0 + vmull.s16 q10, d29, d0 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vadd.s32 q0, q2, q8 + vadd.s32 q1, q3, q10 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q0, #14 ; >> 14 + vqrshrn.s32 d31, q1, #14 ; >> 14 + + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; stage 3 -odd half + vdup.16 d16, r7; ; duplicate cospi_16_64 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q9, q9, q11 + vsub.s32 q10, q10, q12 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7; ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6; ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5; ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4; ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4; ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5; ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6; ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7; ; output[7] = step1[0] - step1[7]; + MEND + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct8x8_add_neon| PROC + push {r4-r9} + vld1.s16 {q8}, [r0]! + vld1.s16 {q9}, [r0]! + vld1.s16 {q10}, [r0]! + vld1.s16 {q11}, [r0]! 
+ vld1.s16 {q12}, [r0]! + vld1.s16 {q13}, [r0]! + vld1.s16 {q14}, [r0]! + vld1.s16 {q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + IDCT8x8_1D + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.u8 {d0}, [r1], r2 + vld1.u8 {d1}, [r1], r2 + vld1.s16 {d2}, [r1], r2 + vld1.s16 {d3}, [r1], r2 + vld1.s16 {d4}, [r1], r2 + vld1.s16 {d5}, [r1], r2 + vld1.s16 {d6}, [r1], r2 + vld1.s16 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + pop {r4-r9} + bx lr + ENDP ; |vp9_short_idct8x8_add_neon| + + END diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 2660344..554a317 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" + #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" @@ -52,7 +53,6 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) { for (i = 0; i < NUM_YV12_BUFFERS; i++) vp9_free_frame_buffer(&oci->yv12_fb[i]); - vp9_free_frame_buffer(&oci->temp_scale_frame); vp9_free_frame_buffer(&oci->post_proc_buffer); vpx_free(oci->mip); @@ -62,9 +62,9 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) { vpx_free(oci->above_context[0]); for (i = 0; i < MAX_MB_PLANE; i++) oci->above_context[i] = 0; - oci->mip = 0; - oci->prev_mip = 0; - oci->above_seg_context = 0; + oci->mip = NULL; + oci->prev_mip = NULL; + oci->above_seg_context = NULL; } static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { @@ -74,7 +74,7 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { cm->mi_cols = aligned_width >> LOG2_MI_SIZE; cm->mi_rows = aligned_height >> LOG2_MI_SIZE; - cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE; + cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; } static void setup_mi(VP9_COMMON *cm) { @@ -94,11 +94,11 @@ static void setup_mi(VP9_COMMON *cm) { int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { int i, mi_cols; - // Our internal buffers are always multiples of 16 
- const int aligned_width = multiple8(width); - const int aligned_height = multiple8(height); + const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE); + const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE); const int ss_x = oci->subsampling_x; const int ss_y = oci->subsampling_y; + int mi_size; vp9_free_frame_buffers(oci); @@ -120,10 +120,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { oci->fb_idx_ref_cnt[i] = 1; } - if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) - goto fail; - if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) goto fail; @@ -131,14 +127,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { set_mb_mi(oci, aligned_width, aligned_height); // Allocation - oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE), - sizeof(MODE_INFO)); + mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE); + + oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); if (!oci->mip) goto fail; - oci->prev_mip = vpx_calloc(oci->mode_info_stride * - (oci->mi_rows + 64 / MI_SIZE), - sizeof(MODE_INFO)); + oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); if (!oci->prev_mip) goto fail; @@ -146,7 +141,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(oci); + mi_cols = mi_cols_aligned_to_sb(oci->mi_cols); // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm // block where mi unit size is 8x8. @@ -158,10 +153,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { if (!oci->above_context[0]) goto fail; - for (i = 1; i < MAX_MB_PLANE; i++) - oci->above_context[i] = - oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; - oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); if (!oci->above_seg_context) goto fail; @@ -178,9 +169,8 @@ void vp9_create_common(VP9_COMMON *oci) { vp9_init_mbmode_probs(oci); - oci->txfm_mode = ONLY_4X4; + oci->tx_mode = ONLY_4X4; oci->comp_pred_mode = HYBRID_PREDICTION; - oci->clr_type = REG_YUV; // Initialize reference frame sign bias structure to defaults vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); @@ -197,9 +187,15 @@ void vp9_initialize_common() { } void vp9_update_frame_size(VP9_COMMON *cm) { - const int aligned_width = multiple8(cm->width); - const int aligned_height = multiple8(cm->height); + int i, mi_cols; + const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE); + const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); + + mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + for (i = 1; i < MAX_MB_PLANE; i++) + cm->above_context[i] = + cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; } diff --git a/libvpx/vp9/common/vp9_asm_com_offsets.c b/libvpx/vp9/common/vp9_asm_com_offsets.c deleted file mode 100644 index 94ccb6e..0000000 --- a/libvpx/vp9/common/vp9_asm_com_offsets.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
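As a worked illustration of the MI-based sizing in vp9_alloccommon.c above (assuming LOG2_MI_SIZE == 3 and MI_BLOCK_SIZE == 8, which matches the old 64 / MI_SIZE expression; the frame dimensions are arbitrary examples):

#include <stdio.h>

#define LOG2_MI_SIZE  3
#define MI_BLOCK_SIZE 8
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int width = 100, height = 70;                                   /* example only */
  const int aligned_width  = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);   /* 104 */
  const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);  /* 72 */
  const int mi_cols = aligned_width >> LOG2_MI_SIZE;                    /* 13 */
  const int mi_rows = aligned_height >> LOG2_MI_SIZE;                   /* 9 */
  const int mode_info_stride = mi_cols + MI_BLOCK_SIZE;                 /* 21 */
  const int mi_size = mode_info_stride * (mi_rows + MI_BLOCK_SIZE);     /* 357 */
  printf("%d MODE_INFO entries allocated\n", mi_size);
  return 0;
}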
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vpx/vpx_codec.h" -#include "vpx_ports/asm_offsets.h" - -BEGIN - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index 37d29af..1297114 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -13,28 +13,25 @@ #define VP9_COMMON_VP9_BLOCKD_H_ #include "./vpx_config.h" + +#include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_common_data.h" #include "vp9/common/vp9_convolve.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" -#include "vpx_ports/mem.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_enums.h" #define BLOCK_SIZE_GROUPS 4 -#define MAX_MB_SEGMENTS 8 -#define MB_SEG_TREE_PROBS (MAX_MB_SEGMENTS-1) #define PREDICTION_PROBS 3 #define MBSKIP_CONTEXTS 3 -#define MAX_REF_LF_DELTAS 4 -#define MAX_MODE_LF_DELTAS 2 - /* Segment Feature Masks */ -#define SEGMENT_DELTADATA 0 -#define SEGMENT_ABSDATA 1 #define MAX_MV_REF_CANDIDATES 2 #define INTRA_INTER_CONTEXTS 4 @@ -87,56 +84,28 @@ typedef enum { MB_MODE_COUNT } MB_PREDICTION_MODE; +static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) { + return mode <= TM_PRED; +} + static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } -// Segment level features. -typedef enum { - SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... - SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... - SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame - SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode - SEG_LVL_MAX = 4 // Number of MB level features supported -} SEG_LVL_FEATURES; - -// Segment level features. -typedef enum { - TX_4X4 = 0, // 4x4 dct transform - TX_8X8 = 1, // 8x8 dct transform - TX_16X16 = 2, // 16x16 dct transform - TX_32X32 = 3, // 32x32 dct transform - TX_SIZE_MAX_SB, // Number of transforms available to SBs -} TX_SIZE; - -typedef enum { - DCT_DCT = 0, // DCT in both horizontal and vertical - ADST_DCT = 1, // ADST in vertical, DCT in horizontal - DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3 // ADST in both directions -} TX_TYPE; - #define VP9_INTRA_MODES (TM_PRED + 1) #define VP9_INTER_MODES (1 + NEWMV - NEARESTMV) -#define WHT_UPSCALE_FACTOR 2 - -#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2) - -#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \ - (c)->fc.tx_probs_8x8p : \ - (b) < BLOCK_SIZE_SB32X32 ? \ - (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p) +static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { + return (mode - NEARESTMV); +} /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ union b_mode_info { - struct { - MB_PREDICTION_MODE first; - } as_mode; + MB_PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors }; @@ -150,60 +119,18 @@ typedef enum { } MV_REFERENCE_FRAME; static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) { - switch (sb_type) { - case BLOCK_SIZE_SB4X8: - case BLOCK_SIZE_AB4X4: return 0; - case BLOCK_SIZE_SB8X4: - case BLOCK_SIZE_SB8X8: - case BLOCK_SIZE_SB8X16: return 1; - case BLOCK_SIZE_SB16X8: - case BLOCK_SIZE_MB16X16: - case BLOCK_SIZE_SB16X32: return 2; - case BLOCK_SIZE_SB32X16: - case BLOCK_SIZE_SB32X32: - case BLOCK_SIZE_SB32X64: return 3; - case BLOCK_SIZE_SB64X32: - case BLOCK_SIZE_SB64X64: return 4; - default: assert(0); - return -1; - } + return b_width_log2_lookup[sb_type]; } - static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) { - switch (sb_type) { - case BLOCK_SIZE_SB8X4: - case BLOCK_SIZE_AB4X4: return 0; - case BLOCK_SIZE_SB4X8: - case BLOCK_SIZE_SB8X8: - case BLOCK_SIZE_SB16X8: return 1; - case BLOCK_SIZE_SB8X16: - case BLOCK_SIZE_MB16X16: - case BLOCK_SIZE_SB32X16: return 2; - case BLOCK_SIZE_SB16X32: - case BLOCK_SIZE_SB32X32: - case BLOCK_SIZE_SB64X32: return 3; - case BLOCK_SIZE_SB32X64: - case BLOCK_SIZE_SB64X64: return 4; - default: assert(0); - return -1; - } + return b_height_log2_lookup[sb_type]; } static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) { - int a = b_width_log2(sb_type) - 1; - // align 4x4 block to mode_info - if (a < 0) - a = 0; - assert(a >= 0); - return a; + return mi_width_log2_lookup[sb_type]; } static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) { - int a = b_height_log2(sb_type) - 1; - if (a < 0) - a = 0; - assert(a >= 0); - return a; + return mi_height_log2_lookup[sb_type]; } typedef struct { @@ -214,7 +141,7 @@ typedef struct { int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int_mv best_mv, best_second_mv; - int mb_mode_context[MAX_REF_FRAMES]; + uint8_t mb_mode_context[MAX_REF_FRAMES]; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ unsigned char segment_id; // Segment id for current frame @@ -237,7 +164,14 @@ typedef struct { union b_mode_info bmi[4]; } MODE_INFO; +enum mv_precision { + MV_PRECISION_Q3, + MV_PRECISION_Q4 +}; + #define VP9_REF_SCALE_SHIFT 14 +#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT) + struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor @@ -249,9 +183,8 @@ struct scale_factors { int (*scale_value_x)(int val, const struct scale_factors *scale); int (*scale_value_y)(int val, const struct scale_factors *scale); void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); - int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv, - const struct scale_factors *scale); - int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4); + MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale); + MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale); convolve_fn_t predict[2][2][2]; // horiz, vert, avg }; @@ -283,71 +216,53 @@ struct macroblockd_plane { #define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n)) +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +struct loopfilter { + int filter_level; + + int sharpness_level; + int last_sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, GF, ARF + signed char ref_deltas[MAX_REF_LF_DELTAS]; + signed char 
last_ref_deltas[MAX_REF_LF_DELTAS]; + + // 0 = ZERO_MV, MV + signed char mode_deltas[MAX_MODE_LF_DELTAS]; + signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; +}; + typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; struct scale_factors scale_factor[2]; - struct scale_factors scale_factor_uv[2]; MODE_INFO *prev_mode_info_context; MODE_INFO *mode_info_context; int mode_info_stride; - FRAME_TYPE frame_type; - int up_available; int left_available; int right_available; + struct segmentation seg; + struct loopfilter lf; + // partition contexts PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT *left_seg_context; - /* 0 (disable) 1 (enable) segmentation */ - unsigned char segmentation_enabled; - - /* 0 (do not update) 1 (update) the macroblock segmentation map. */ - unsigned char update_mb_segmentation_map; - - /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char update_mb_segmentation_data; - - /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char mb_segment_abs_delta; - - /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */ - /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */ - - // Probability Tree used to code Segment number - vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS]; - - // Segment features - int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX]; - unsigned int segment_feature_mask[MAX_MB_SEGMENTS]; - - /* mode_based Loop filter adjustment */ - unsigned char mode_ref_lf_delta_enabled; - unsigned char mode_ref_lf_delta_update; - - /* Delta values have the range +/- MAX_LOOP_FILTER */ - /* 0 = Intra, Last, GF, ARF */ - signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; - /* 0 = Intra, Last, GF, ARF */ - signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; - /* 0 = ZERO_MV, MV */ - signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - /* 0 = ZERO_MV, MV */ - signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; - /* Distance of MB away from frame edges */ int mb_to_left_edge; int mb_to_right_edge; int mb_to_top_edge; int mb_to_bottom_edge; - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - int lossless; /* Inverse transform function pointers. 
*/ void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride); @@ -360,15 +275,16 @@ typedef struct macroblockd { int corrupted; - int sb_index; // index of 32x32 block inside the 64x64 block - int mb_index; // index of 16x16 block inside the 32x32 block - int b_index; // index of 8x8 block inside the 16x16 block - int ab_index; // index of 4x4 block inside the 8x8 block + unsigned char sb_index; // index of 32x32 block inside the 64x64 block + unsigned char mb_index; // index of 16x16 block inside the 32x32 block + unsigned char b_index; // index of 8x8 block inside the 16x16 block + unsigned char ab_index; // index of 4x4 block inside the 8x8 block + int q_index; } MACROBLOCKD; -static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { +static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { switch (subsize) { case BLOCK_SIZE_SB64X64: case BLOCK_SIZE_SB64X32: @@ -396,38 +312,21 @@ static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE_TYPE sb_type, BLOCK_SIZE_TYPE sb_size) { - int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; - int bwl = b_width_log2(sb_type); - int bhl = b_height_log2(sb_type); - int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; - int i; + const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; + const int bwl = b_width_log2(sb_type); + const int bhl = b_height_log2(sb_type); + const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; + const char pcval0 = ~(0xe << boffset); + const char pcval1 = ~(0xf << boffset); + const char pcvalue[2] = {pcval0, pcval1}; + + assert(MAX(bwl, bhl) <= bsl); // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition // bits of smaller block sizes to be zero. 
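  // (Equivalently: the above contexts depend only on whether the block spans
  // the full width at this level (bwl == bsl) and the left contexts only on
  // whether it spans the full height (bhl == bsl), so the four-way branch
  // collapses to two vpx_memset() calls with the precomputed pcvalue[] bytes:
  // ~(0xf << boffset) for a full span, ~(0xe << boffset) otherwise.)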
- if ((bwl == bsl) && (bhl == bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xf << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xf << boffset); - } else if ((bwl == bsl) && (bhl < bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xe << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xf << boffset); - } else if ((bwl < bsl) && (bhl == bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xf << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xe << boffset); - } else if ((bwl < bsl) && (bhl < bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xe << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xe << boffset); - } else { - assert(0); - } + vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs); + vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs); } static INLINE int partition_plane_context(MACROBLOCKD *xd, @@ -453,134 +352,57 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize, PARTITION_TYPE partition) { - BLOCK_SIZE_TYPE subsize; - switch (partition) { - case PARTITION_NONE: - subsize = bsize; - break; - case PARTITION_HORZ: - if (bsize == BLOCK_SIZE_SB64X64) - subsize = BLOCK_SIZE_SB64X32; - else if (bsize == BLOCK_SIZE_SB32X32) - subsize = BLOCK_SIZE_SB32X16; - else if (bsize == BLOCK_SIZE_MB16X16) - subsize = BLOCK_SIZE_SB16X8; - else if (bsize == BLOCK_SIZE_SB8X8) - subsize = BLOCK_SIZE_SB8X4; - else - assert(0); - break; - case PARTITION_VERT: - if (bsize == BLOCK_SIZE_SB64X64) - subsize = BLOCK_SIZE_SB32X64; - else if (bsize == BLOCK_SIZE_SB32X32) - subsize = BLOCK_SIZE_SB16X32; - else if (bsize == BLOCK_SIZE_MB16X16) - subsize = BLOCK_SIZE_SB8X16; - else if (bsize == BLOCK_SIZE_SB8X8) - subsize = BLOCK_SIZE_SB4X8; - else - assert(0); - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_SIZE_SB64X64) - subsize = BLOCK_SIZE_SB32X32; - else if (bsize == BLOCK_SIZE_SB32X32) - subsize = BLOCK_SIZE_MB16X16; - else if (bsize == BLOCK_SIZE_MB16X16) - subsize = BLOCK_SIZE_SB8X8; - else if (bsize == BLOCK_SIZE_SB8X8) - subsize = BLOCK_SIZE_AB4X4; - else - assert(0); - break; - default: - assert(0); - } + BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize]; + assert(subsize != BLOCK_SIZE_TYPES); return subsize; } -// transform mapping -static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) { - switch (bmode) { - case TM_PRED : - case D135_PRED : - return ADST_ADST; +extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; - case V_PRED : - case D117_PRED : - case D63_PRED: - return ADST_DCT; +static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, int ib) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const mbmi = &mi->mbmi; - case H_PRED : - case D153_PRED : - case D27_PRED : - return DCT_ADST; + if (plane_type != PLANE_TYPE_Y_WITH_DC || + xd->lossless || + mbmi->ref_frame[0] != INTRA_FRAME) + return DCT_DCT; - default: - return DCT_DCT; - } + return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ? 
+ mi->bmi[ib].as_mode : mbmi->mode]; } -static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) { - TX_TYPE tx_type; - MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; - if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME) - return DCT_DCT; - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - tx_type = txfm_map(mi->bmi[ib].as_mode.first); - } else { - assert(mbmi->mode <= TM_PRED); - tx_type = txfm_map(mbmi->mode); - } - return tx_type; +static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + return plane_type == PLANE_TYPE_Y_WITH_DC ? + mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; } -static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) { - TX_TYPE tx_type = DCT_DCT; - if (xd->mode_info_context->mbmi.mode <= TM_PRED) { - tx_type = txfm_map(xd->mode_info_context->mbmi.mode); - } - return tx_type; +static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + return plane_type == PLANE_TYPE_Y_WITH_DC ? + mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; } -static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) { - TX_TYPE tx_type = DCT_DCT; - if (xd->mode_info_context->mbmi.mode <= TM_PRED) { - tx_type = txfm_map(xd->mode_info_context->mbmi.mode); +static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; } - return tx_type; +#if CONFIG_ALPHA + // TODO(jkoleszar): Using the Y w/h for now + xd->plane[3].subsampling_x = 0; + xd->plane[3].subsampling_y = 0; +#endif } -void vp9_setup_block_dptrs(MACROBLOCKD *xd, - int subsampling_x, int subsampling_y); - -static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - const TX_SIZE size = mbmi->txfm_size; - - switch (mbmi->sb_type) { - case BLOCK_SIZE_SB64X64: - return size; - case BLOCK_SIZE_SB64X32: - case BLOCK_SIZE_SB32X64: - case BLOCK_SIZE_SB32X32: - if (size == TX_32X32) - return TX_16X16; - else - return size; - case BLOCK_SIZE_SB32X16: - case BLOCK_SIZE_SB16X32: - case BLOCK_SIZE_MB16X16: - if (size == TX_16X16) - return TX_8X8; - else - return size; - default: - return TX_4X4; - } - return size; +static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { + return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]); } struct plane_block_idx { @@ -619,6 +441,16 @@ static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize, return 4 << (b_height_log2(bsize) - plane->subsampling_y); } +static INLINE int plane_block_width_log2by4( + BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { + return (b_width_log2(bsize) - plane->subsampling_x); +} + +static INLINE int plane_block_height_log2by4( + BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { + return (b_height_log2(bsize) - plane->subsampling_y); +} + typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, @@ -795,11 +627,11 @@ static int txfrm_block_to_raster_block(MACROBLOCKD *xd, int ss_txfrm_size) { const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int txwl = ss_txfrm_size / 2; - const int tx_cols_lg2 = bwl - txwl; - const int tx_cols = 1 << tx_cols_lg2; + const int tx_cols_log2 = bwl - txwl; + const int tx_cols = 1 << tx_cols_log2; const int raster_mb = block >> ss_txfrm_size; const int x = (raster_mb & (tx_cols - 1)) << 
(txwl); - const int y = raster_mb >> tx_cols_lg2 << (txwl); + const int y = raster_mb >> tx_cols_log2 << (txwl); return x + (y << bwl); } @@ -810,11 +642,11 @@ static void txfrm_block_to_raster_xy(MACROBLOCKD *xd, int *x, int *y) { const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int txwl = ss_txfrm_size / 2; - const int tx_cols_lg2 = bwl - txwl; - const int tx_cols = 1 << tx_cols_lg2; + const int tx_cols_log2 = bwl - txwl; + const int tx_cols = 1 << tx_cols_log2; const int raster_mb = block >> ss_txfrm_size; *x = (raster_mb & (tx_cols - 1)) << (txwl); - *y = raster_mb >> tx_cols_lg2 << (txwl); + *y = raster_mb >> tx_cols_log2 << (txwl); } static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 0d7babf..1796906 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -22,12 +22,11 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) +#define ROUND_POWER_OF_TWO(value, n) \ + (((value) + (1 << ((n) - 1))) >> (n)) -/* If we don't want to use ROUND_POWER_OF_TWO macro -static INLINE int16_t round_power_of_two(int16_t value, int n) { - return (value + (1 << (n - 1))) >> n; -}*/ +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) // Only need this for fixed-size arrays, for structs just assign. #define vp9_copy(dest, src) { \ @@ -56,10 +55,35 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -static INLINE int multiple8(int value) { - return (value + 7) & ~7; +static int get_unsigned_bits(unsigned int num_values) { + int cat = 0; + if (num_values <= 1) + return 0; + num_values--; + while (num_values > 0) { + cat++; + num_values >>= 1; + } + return cat; } +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(cm, lval, expr) do { \ + lval = (expr); \ + if (!lval) \ + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate "#lval" at %s:%d", \ + __FILE__, __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(cm, lval, expr) do { \ + lval = (expr); \ + if (!lval) \ + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate "#lval); \ + } while (0) +#endif + #define SYNC_CODE_0 0x49 #define SYNC_CODE_1 0x83 #define SYNC_CODE_2 0x42 diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c new file mode 100644 index 0000000..dee44ec --- /dev/null +++ b/libvpx/vp9/common/vp9_common_data.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
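A quick worked example of the ROUND_POWER_OF_TWO and ALIGN_POWER_OF_TWO helpers added to vp9_common.h above; the macros are copied from that hunk and the values are chosen purely for illustration:

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) \
    (((value) + (1 << ((n) - 1))) >> (n))
#define ALIGN_POWER_OF_TWO(value, n) \
    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

static void check_power_of_two_helpers(void) {
  assert(ROUND_POWER_OF_TWO(5, 1) == 3);    /* (5 + 1) >> 1: 2.5 rounds up to 3 */
  assert(ROUND_POWER_OF_TWO(70, 3) == 9);   /* (70 + 4) >> 3: 8.75 rounds to 9  */
  assert(ALIGN_POWER_OF_TWO(5, 3) == 8);    /* round 5 up to a multiple of 8    */
  assert(ALIGN_POWER_OF_TWO(64, 4) == 64);  /* 64 is already 16-aligned         */
}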
+ */ + + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common_data.h" + +// Log 2 conversion lookup tables for block width and height +const int b_width_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; +const int b_height_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; +const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; +const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] = + {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; +// Log 2 conversion lookup tables for modeinfo width and height +const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; +const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; +const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; +const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; + +const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { + { // 4X4 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID + }, { // 8X8 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID + }, { // 16X16 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 32X32 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, + PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, + PARTITION_NONE + } +}; + +const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { + { // PARTITION_NONE + BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB8X4, + BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8, + BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X64, + }, { // PARTITION_HORZ + BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB64X32, + }, { // PARTITION_VERT + BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + 
BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB32X64, + }, { // PARTITION_SPLIT + BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB32X32, + } +}; + +const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, + TX_16X16, TX_16X16, TX_16X16, + TX_32X32, TX_32X32, TX_32X32, TX_32X32 +}; +const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, + TX_16X16, TX_16X16, TX_16X16, TX_32X32 +}; + +const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = { + {BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8, + BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8}, + {BLOCK_SIZE_SB8X4, BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, + BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB8X16}, + {BLOCK_SIZE_SB16X8, BLOCK_SIZE_SB16X8, BLOCK_SIZE_MB16X16, + BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32}, + {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64}, + {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64} +}; diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h new file mode 100644 index 0000000..8b0f8a5 --- /dev/null +++ b/libvpx/vp9/common/vp9_common_data.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
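The lookup tables above replace the per-size switch statements removed from vp9_blockd.h earlier in this diff (get_subsize and get_uv_tx_size). A minimal sketch of how they are consulted, assuming it is compiled inside the libvpx tree so the enums from vp9_enums.h are visible; the two checks mirror cases of the removed switches:

#include <assert.h>
#include "vp9/common/vp9_common_data.h"

static void check_lookup_tables(void) {
  /* Splitting a 64x64 superblock horizontally yields two 64x32 blocks. */
  assert(subsize_lookup[PARTITION_HORZ][BLOCK_SIZE_SB64X64] ==
         BLOCK_SIZE_SB64X32);
  /* For a 32x32 block the chroma transform is capped at 16x16, so
   * MIN(TX_32X32, max_uv_txsize_lookup[BLOCK_SIZE_SB32X32]) gives TX_16X16,
   * matching the old get_uv_tx_size() switch. */
  assert(max_uv_txsize_lookup[BLOCK_SIZE_SB32X32] == TX_16X16);
}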
+ */ + +#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ +#define VP9_COMMON_VP9_COMMON_DATA_H_ + +#include "vp9/common/vp9_enums.h" + +extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; +extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; +extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; +extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; +extern const PARTITION_TYPE + partition_lookup[][BLOCK_SIZE_TYPES]; + + +extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; +extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; +extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; +extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5]; + +#endif // VP9_COMMON_VP9_COMMON_DATA_H diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index 46ae503..6f1e418 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -38,8 +38,8 @@ */ #define ALIGN_FILTERS_256 1 -static void convolve_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride, } } -static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, } } -static void convolve_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { @@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride, } } -static void convolve_avg_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { @@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride, } } -static void convolve_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -217,12 +217,13 @@ static void convolve_c(const uint8_t *src, int src_stride, * h == 64, taps == 8. 
*/ uint8_t temp[64 * 135]; - int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; assert(w <= 64); assert(h <= 64); assert(taps <= 8); assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); if (intermediate_height < h) intermediate_height = h; @@ -236,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride, w, h, taps); } -static void convolve_avg_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -246,12 +247,13 @@ static void convolve_avg_c(const uint8_t *src, int src_stride, * h == 64, taps == 8. */ uint8_t temp[64 * 135]; - int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; assert(w <= 64); assert(h <= 64); assert(taps <= 8); assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); if (intermediate_height < h) intermediate_height = h; @@ -265,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride, w, h, taps); } -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -275,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -285,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -295,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -305,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -315,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -337,33 +339,25 @@ void 
vp9_convolve8_avg_c(const uint8_t *src, int src_stride, w, h); } -void vp9_convolve_copy(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - if (w == 16 && h == 16) { - vp9_copy_mem16x16(src, src_stride, dst, dst_stride); - } else if (w == 8 && h == 8) { - vp9_copy_mem8x8(src, src_stride, dst, dst_stride); - } else if (w == 8 && h == 4) { - vp9_copy_mem8x4(src, src_stride, dst, dst_stride); - } else { - int r; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int r; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; } } -void vp9_convolve_avg(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { int x, y; for (y = 0; y < h; ++y) { diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h index 0596080..3de8111 100644 --- a/libvpx/vp9/common/vp9_convolve.h +++ b/libvpx/vp9/common/vp9_convolve.h @@ -13,26 +13,12 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -// Not a convolution, a block copy conforming to the convolution prototype -void vp9_convolve_copy(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -// Not a convolution, a block average conforming to the convolution prototype -void vp9_convolve_avg(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - struct subpix_fn_table { const int16_t (*filter_x)[8]; const int16_t (*filter_y)[8]; diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 5841f80..370ebe8 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -11,126 +11,68 @@ #include <stdio.h> #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" -void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, - int frame, char *file) { +static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, + cm->show_frame, cm->base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure. 
+ */ +static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor, + size_t member_offset) { int mi_row; int mi_col; int mi_index = 0; - FILE *mvs = fopen(file, "a"); - - // Print out the macroblock Y modes - fprintf(mvs, "SB Types for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type); - - mi_index++; - } - - fprintf(mvs, "\n"); - mi_index += 8; - } + MODE_INFO *mi = common->mi; + int rows = common->mi_rows; + int cols = common->mi_cols; + char prefix = descriptor[0]; - // Print out the macroblock Y modes - fprintf(mvs, "Mb Modes for Frame %d\n", frame); + log_frame_info(common, descriptor, file); mi_index = 0; for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode); - + fprintf(file, "%2d ", + *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset))); mi_index++; } - - fprintf(mvs, "\n"); + fprintf(file, "\n"); mi_index += 8; } - - fprintf(mvs, "\n"); - - mi_index = 0; - fprintf(mvs, "Mb mv ref for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]); - - mi_index++; - } - - fprintf(mvs, "\n"); - mi_index += 8; - } - fprintf(mvs, "\n"); - - mi_index = 0; - fprintf(mvs, "Mb mv ref for Frame %d\n", frame); - + fprintf(file, "\n"); +} +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { + int mi_row; + int mi_col; + int mi_index = 0; + FILE *mvs = fopen(file, "a"); + MODE_INFO *mi = cm->mi; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); + print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); + + log_frame_info(cm, "Vectors ",mvs); for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs,"V "); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row, mi[mi_index].mbmi.mv[0].as_mv.col); - mi_index++; } - fprintf(mvs, "\n"); mi_index += 8; } - - fprintf(mvs, "\n"); - - /* print out the macroblock txform sizes */ - mi_index = 0; - fprintf(mvs, "TXFM size for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - - fprintf(mvs, "\n"); - - /* print out the macroblock UV modes */ - mi_index = 0; - fprintf(mvs, "UV Modes for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - - fprintf(mvs, "\n"); - - /* print out the macroblock mvs */ - mi_index = 0; - fprintf(mvs, "MVs for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2, - mi[mi_index].mbmi.mv[0].as_mv.col / 2); - - mi_index++; - } - - mi_index += 8; - 
fprintf(mvs, "\n"); - } - fprintf(mvs, "\n"); fclose(mvs); diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h index 1954093..185fced 100644 --- a/libvpx/vp9/common/vp9_default_coef_probs.h +++ b/libvpx/vp9/common/vp9_default_coef_probs.h @@ -8,695 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*Generated file, included by vp9_entropy.c*/ - -#if CONFIG_BALANCED_COEFTREE -static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 6, 213, 178 }, - { 26, 113, 132 }, - { 34, 17, 68 } - }, { /* Coeff Band 1 */ - { 66, 96, 178 }, - { 63, 96, 174 }, - { 67, 54, 154 }, - { 62, 28, 126 }, - { 48, 9, 84 }, - { 20, 1, 32 } - }, { /* Coeff Band 2 */ - { 64, 144, 206 }, - { 70, 99, 191 }, - { 69, 36, 152 }, - { 55, 9, 106 }, - { 35, 1, 60 }, - { 14, 1, 22 } - }, { /* Coeff Band 3 */ - { 82, 154, 222 }, - { 83, 112, 205 }, - { 81, 31, 164 }, - { 62, 7, 118 }, - { 42, 1, 74 }, - { 18, 1, 30 } - }, { /* Coeff Band 4 */ - { 52, 179, 233 }, - { 64, 132, 214 }, - { 73, 36, 170 }, - { 59, 8, 116 }, - { 38, 1, 65 }, - { 15, 1, 26 } - }, { /* Coeff Band 5 */ - { 29, 175, 238 }, - { 26, 169, 223 }, - { 41, 80, 182 }, - { 39, 32, 127 }, - { 26, 10, 69 }, - { 11, 2, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 21, 226, 234 }, - { 52, 182, 212 }, - { 80, 112, 177 } - }, { /* Coeff Band 1 */ - { 111, 164, 243 }, - { 88, 152, 231 }, - { 90, 43, 186 }, - { 70, 12, 132 }, - { 44, 2, 76 }, - { 19, 1, 33 } - }, { /* Coeff Band 2 */ - { 96, 185, 246 }, - { 99, 127, 231 }, - { 88, 21, 177 }, - { 64, 5, 122 }, - { 38, 1, 69 }, - { 18, 1, 30 } - }, { /* Coeff Band 3 */ - { 84, 206, 249 }, - { 94, 147, 237 }, - { 95, 33, 187 }, - { 71, 8, 131 }, - { 47, 1, 83 }, - { 26, 1, 44 } - }, { /* Coeff Band 4 */ - { 38, 221, 252 }, - { 58, 177, 241 }, - { 78, 46, 188 }, - { 59, 9, 122 }, - { 34, 1, 66 }, - { 18, 1, 34 } - }, { /* Coeff Band 5 */ - { 21, 216, 253 }, - { 21, 206, 244 }, - { 42, 93, 200 }, - { 43, 41, 146 }, - { 36, 13, 93 }, - { 31, 1, 55 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 213, 219 }, - { 23, 139, 182 }, - { 38, 60, 125 } - }, { /* Coeff Band 1 */ - { 69, 156, 220 }, - { 52, 178, 213 }, - { 69, 111, 190 }, - { 69, 58, 155 }, - { 58, 21, 104 }, - { 39, 7, 60 } - }, { /* Coeff Band 2 */ - { 68, 189, 228 }, - { 70, 158, 221 }, - { 83, 64, 189 }, - { 73, 18, 141 }, - { 48, 4, 88 }, - { 23, 1, 41 } - }, { /* Coeff Band 3 */ - { 99, 194, 236 }, - { 91, 138, 224 }, - { 91, 53, 189 }, - { 74, 20, 142 }, - { 48, 6, 90 }, - { 22, 1, 41 } - }, { /* Coeff Band 4 */ - { 52, 203, 244 }, - { 60, 168, 231 }, - { 75, 62, 189 }, - { 61, 18, 132 }, - { 38, 4, 72 }, - { 17, 1, 39 } - }, { /* Coeff Band 5 */ - { 33, 192, 247 }, - { 31, 185, 234 }, - { 46, 85, 185 }, - { 39, 35, 132 }, - { 28, 15, 80 }, - { 13, 5, 38 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 5, 247, 246 }, - { 28, 209, 228 }, - { 65, 137, 203 } - }, { /* Coeff Band 1 */ - { 69, 208, 250 }, - { 54, 207, 242 }, - { 81, 92, 204 }, - { 70, 54, 153 }, - { 58, 40, 108 }, - { 58, 35, 71 } - }, { /* Coeff Band 2 */ - { 65, 215, 250 }, - { 72, 185, 239 }, - { 92, 50, 197 }, - { 75, 14, 147 }, - { 49, 2, 99 }, - { 26, 1, 53 } - }, { /* Coeff Band 3 */ - { 70, 220, 251 }, - { 76, 186, 241 }, - { 90, 65, 198 }, - { 75, 26, 151 }, - { 58, 12, 112 }, - { 34, 6, 49 } - }, { /* Coeff Band 4 */ - { 34, 224, 253 }, - { 44, 204, 245 }, - { 69, 85, 204 }, - { 64, 31, 150 
}, - { 44, 2, 78 }, - { 1, 1, 128 } - }, { /* Coeff Band 5 */ - { 25, 216, 253 }, - { 21, 215, 248 }, - { 47, 108, 214 }, - { 47, 48, 160 }, - { 26, 20, 90 }, - { 64, 171, 128 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 9, 203, 199 }, - { 26, 92, 128 }, - { 28, 11, 55 } - }, { /* Coeff Band 1 */ - { 99, 54, 160 }, - { 78, 99, 155 }, - { 80, 44, 138 }, - { 71, 17, 115 }, - { 51, 5, 80 }, - { 27, 1, 40 } - }, { /* Coeff Band 2 */ - { 135, 81, 190 }, - { 113, 61, 182 }, - { 93, 16, 153 }, - { 70, 4, 115 }, - { 41, 1, 68 }, - { 16, 1, 27 } - }, { /* Coeff Band 3 */ - { 155, 103, 214 }, - { 129, 48, 199 }, - { 95, 10, 159 }, - { 63, 1, 110 }, - { 32, 1, 58 }, - { 12, 1, 21 } - }, { /* Coeff Band 4 */ - { 163, 149, 231 }, - { 137, 69, 213 }, - { 95, 11, 164 }, - { 62, 3, 108 }, - { 32, 1, 57 }, - { 13, 1, 22 } - }, { /* Coeff Band 5 */ - { 136, 189, 239 }, - { 123, 102, 223 }, - { 97, 19, 170 }, - { 66, 4, 111 }, - { 38, 1, 60 }, - { 18, 1, 26 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 24, 226, 244 }, - { 54, 178, 211 }, - { 80, 74, 152 } - }, { /* Coeff Band 1 */ - { 145, 153, 236 }, - { 101, 163, 223 }, - { 108, 50, 187 }, - { 90, 22, 145 }, - { 66, 8, 97 }, - { 42, 4, 50 } - }, { /* Coeff Band 2 */ - { 150, 159, 238 }, - { 128, 90, 218 }, - { 94, 9, 163 }, - { 64, 3, 110 }, - { 34, 1, 61 }, - { 13, 1, 24 } - }, { /* Coeff Band 3 */ - { 151, 162, 242 }, - { 135, 80, 222 }, - { 93, 9, 166 }, - { 61, 3, 111 }, - { 31, 1, 59 }, - { 12, 1, 22 } - }, { /* Coeff Band 4 */ - { 161, 170, 245 }, - { 140, 84, 228 }, - { 99, 8, 174 }, - { 64, 1, 116 }, - { 34, 1, 63 }, - { 14, 1, 26 } - }, { /* Coeff Band 5 */ - { 138, 197, 246 }, - { 127, 109, 233 }, - { 100, 16, 179 }, - { 66, 3, 119 }, - { 37, 1, 66 }, - { 16, 1, 30 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 6, 216, 212 }, - { 25, 134, 171 }, - { 43, 48, 118 } - }, { /* Coeff Band 1 */ - { 93, 112, 209 }, - { 66, 159, 206 }, - { 82, 78, 184 }, - { 75, 28, 148 }, - { 46, 4, 82 }, - { 18, 1, 28 } - }, { /* Coeff Band 2 */ - { 108, 148, 220 }, - { 90, 130, 216 }, - { 92, 40, 186 }, - { 73, 10, 135 }, - { 46, 1, 79 }, - { 20, 1, 35 } - }, { /* Coeff Band 3 */ - { 125, 173, 232 }, - { 109, 117, 223 }, - { 97, 31, 183 }, - { 71, 7, 127 }, - { 44, 1, 76 }, - { 21, 1, 36 } - }, { /* Coeff Band 4 */ - { 133, 195, 236 }, - { 112, 121, 224 }, - { 97, 23, 178 }, - { 69, 3, 122 }, - { 42, 1, 72 }, - { 19, 1, 34 } - }, { /* Coeff Band 5 */ - { 132, 180, 238 }, - { 119, 102, 225 }, - { 101, 18, 179 }, - { 71, 3, 124 }, - { 42, 1, 70 }, - { 17, 1, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 5, 242, 250 }, - { 26, 198, 226 }, - { 58, 98, 168 } - }, { /* Coeff Band 1 */ - { 82, 201, 246 }, - { 50, 219, 237 }, - { 94, 107, 205 }, - { 89, 61, 167 }, - { 77, 31, 131 }, - { 57, 14, 91 } - }, { /* Coeff Band 2 */ - { 99, 202, 247 }, - { 96, 165, 234 }, - { 100, 31, 190 }, - { 72, 8, 131 }, - { 41, 1, 72 }, - { 14, 1, 24 } - }, { /* Coeff Band 3 */ - { 108, 204, 248 }, - { 107, 156, 235 }, - { 103, 27, 186 }, - { 71, 4, 124 }, - { 39, 1, 66 }, - { 14, 1, 19 } - }, { /* Coeff Band 4 */ - { 120, 211, 248 }, - { 118, 149, 234 }, - { 107, 19, 182 }, - { 72, 3, 126 }, - { 40, 1, 69 }, - { 16, 1, 24 } - }, { /* Coeff Band 5 */ - { 127, 199, 245 }, - { 122, 125, 232 }, - { 112, 20, 186 }, - { 82, 3, 136 }, - { 55, 1, 88 }, - { 10, 1, 38 } - } - } - } -}; -static const vp9_coeff_probs_model 
default_coef_probs_16x16[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 25, 9, 101 }, - { 25, 2, 67 }, - { 15, 1, 28 } - }, { /* Coeff Band 1 */ - { 67, 30, 118 }, - { 61, 56, 116 }, - { 60, 31, 105 }, - { 52, 11, 85 }, - { 34, 2, 54 }, - { 14, 1, 22 } - }, { /* Coeff Band 2 */ - { 107, 58, 149 }, - { 92, 53, 147 }, - { 78, 14, 123 }, - { 56, 3, 87 }, - { 35, 1, 56 }, - { 17, 1, 27 } - }, { /* Coeff Band 3 */ - { 142, 61, 171 }, - { 111, 30, 162 }, - { 80, 4, 128 }, - { 53, 1, 87 }, - { 31, 1, 52 }, - { 14, 1, 24 } - }, { /* Coeff Band 4 */ - { 171, 73, 200 }, - { 129, 28, 184 }, - { 86, 3, 140 }, - { 54, 1, 90 }, - { 28, 1, 49 }, - { 12, 1, 21 } - }, { /* Coeff Band 5 */ - { 193, 129, 227 }, - { 148, 28, 200 }, - { 90, 2, 144 }, - { 53, 1, 90 }, - { 28, 1, 50 }, - { 13, 1, 22 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 60, 7, 234 }, - { 64, 4, 184 }, - { 56, 1, 104 } - }, { /* Coeff Band 1 */ - { 150, 111, 210 }, - { 87, 185, 202 }, - { 101, 81, 177 }, - { 90, 34, 142 }, - { 67, 11, 95 }, - { 38, 2, 51 } - }, { /* Coeff Band 2 */ - { 153, 139, 218 }, - { 120, 72, 195 }, - { 90, 11, 147 }, - { 63, 3, 101 }, - { 39, 1, 61 }, - { 20, 1, 33 } - }, { /* Coeff Band 3 */ - { 171, 132, 223 }, - { 131, 56, 200 }, - { 92, 6, 147 }, - { 58, 1, 95 }, - { 32, 1, 52 }, - { 14, 1, 23 } - }, { /* Coeff Band 4 */ - { 183, 137, 227 }, - { 139, 48, 204 }, - { 91, 3, 148 }, - { 55, 1, 91 }, - { 28, 1, 47 }, - { 13, 1, 21 } - }, { /* Coeff Band 5 */ - { 198, 149, 234 }, - { 153, 32, 208 }, - { 95, 2, 148 }, - { 55, 1, 90 }, - { 30, 1, 51 }, - { 16, 1, 25 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 209, 217 }, - { 31, 106, 151 }, - { 40, 21, 86 } - }, { /* Coeff Band 1 */ - { 101, 71, 184 }, - { 74, 131, 177 }, - { 88, 50, 158 }, - { 78, 16, 129 }, - { 51, 2, 82 }, - { 18, 1, 29 } - }, { /* Coeff Band 2 */ - { 116, 115, 199 }, - { 102, 88, 191 }, - { 94, 22, 160 }, - { 74, 6, 122 }, - { 47, 1, 77 }, - { 18, 1, 30 } - }, { /* Coeff Band 3 */ - { 157, 124, 210 }, - { 130, 53, 201 }, - { 102, 10, 165 }, - { 73, 1, 120 }, - { 42, 1, 69 }, - { 16, 1, 27 } - }, { /* Coeff Band 4 */ - { 174, 147, 225 }, - { 134, 67, 212 }, - { 100, 10, 168 }, - { 66, 1, 111 }, - { 36, 1, 60 }, - { 16, 1, 27 } - }, { /* Coeff Band 5 */ - { 185, 165, 232 }, - { 147, 56, 214 }, - { 105, 5, 165 }, - { 66, 1, 108 }, - { 35, 1, 59 }, - { 16, 1, 27 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 3, 232, 245 }, - { 18, 162, 210 }, - { 38, 64, 131 } - }, { /* Coeff Band 1 */ - { 84, 187, 239 }, - { 35, 231, 231 }, - { 82, 150, 209 }, - { 87, 97, 181 }, - { 81, 64, 151 }, - { 67, 60, 119 } - }, { /* Coeff Band 2 */ - { 107, 185, 239 }, - { 100, 149, 224 }, - { 107, 34, 185 }, - { 83, 12, 141 }, - { 49, 4, 92 }, - { 21, 1, 40 } - }, { /* Coeff Band 3 */ - { 125, 184, 243 }, - { 121, 127, 228 }, - { 113, 25, 185 }, - { 82, 6, 134 }, - { 48, 1, 82 }, - { 26, 1, 38 } - }, { /* Coeff Band 4 */ - { 143, 185, 245 }, - { 133, 115, 231 }, - { 114, 14, 184 }, - { 77, 3, 126 }, - { 43, 1, 68 }, - { 34, 1, 40 } - }, { /* Coeff Band 5 */ - { 170, 194, 241 }, - { 151, 80, 226 }, - { 118, 9, 180 }, - { 81, 1, 130 }, - { 51, 1, 78 }, - { 18, 1, 49 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 29, 42, 137 }, - { 26, 3, 60 }, - { 13, 1, 23 } - }, { /* Coeff Band 1 */ - { 69, 36, 122 }, - { 63, 57, 123 }, - { 60, 33, 112 }, - { 52, 11, 90 }, - { 32, 2, 52 }, - { 
10, 1, 15 } - }, { /* Coeff Band 2 */ - { 107, 55, 143 }, - { 86, 69, 143 }, - { 74, 24, 116 }, - { 52, 5, 78 }, - { 29, 1, 44 }, - { 12, 1, 18 } - }, { /* Coeff Band 3 */ - { 137, 71, 160 }, - { 107, 34, 152 }, - { 73, 6, 114 }, - { 44, 1, 69 }, - { 25, 1, 40 }, - { 12, 1, 18 } - }, { /* Coeff Band 4 */ - { 165, 70, 174 }, - { 118, 24, 159 }, - { 74, 3, 117 }, - { 45, 1, 73 }, - { 26, 1, 43 }, - { 12, 1, 19 } - }, { /* Coeff Band 5 */ - { 220, 93, 223 }, - { 153, 10, 187 }, - { 86, 2, 131 }, - { 49, 1, 79 }, - { 26, 1, 43 }, - { 12, 1, 20 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 30, 58, 227 }, - { 35, 10, 172 }, - { 24, 23, 112 } - }, { /* Coeff Band 1 */ - { 117, 145, 219 }, - { 51, 221, 216 }, - { 75, 169, 196 }, - { 88, 96, 165 }, - { 77, 43, 117 }, - { 53, 18, 60 } - }, { /* Coeff Band 2 */ - { 128, 176, 225 }, - { 108, 114, 202 }, - { 92, 19, 152 }, - { 65, 4, 103 }, - { 38, 1, 61 }, - { 19, 1, 30 } - }, { /* Coeff Band 3 */ - { 146, 184, 228 }, - { 122, 95, 205 }, - { 92, 11, 149 }, - { 62, 1, 98 }, - { 35, 1, 57 }, - { 17, 1, 26 } - }, { /* Coeff Band 4 */ - { 165, 192, 230 }, - { 132, 81, 206 }, - { 93, 6, 147 }, - { 58, 1, 94 }, - { 32, 1, 52 }, - { 15, 1, 24 } - }, { /* Coeff Band 5 */ - { 204, 223, 234 }, - { 156, 49, 204 }, - { 97, 3, 145 }, - { 59, 1, 92 }, - { 33, 1, 52 }, - { 15, 1, 24 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 184, 200 }, - { 25, 67, 113 }, - { 30, 9, 59 } - }, { /* Coeff Band 1 */ - { 92, 42, 158 }, - { 65, 121, 159 }, - { 77, 56, 146 }, - { 70, 22, 120 }, - { 47, 4, 76 }, - { 18, 1, 26 } - }, { /* Coeff Band 2 */ - { 113, 81, 177 }, - { 96, 75, 167 }, - { 84, 24, 136 }, - { 63, 8, 100 }, - { 37, 1, 58 }, - { 13, 1, 19 } - }, { /* Coeff Band 3 */ - { 147, 85, 194 }, - { 119, 36, 178 }, - { 88, 8, 139 }, - { 59, 1, 93 }, - { 31, 1, 49 }, - { 10, 1, 18 } - }, { /* Coeff Band 4 */ - { 169, 108, 210 }, - { 131, 41, 191 }, - { 92, 5, 144 }, - { 56, 1, 88 }, - { 29, 1, 47 }, - { 14, 1, 22 } - }, { /* Coeff Band 5 */ - { 210, 106, 223 }, - { 148, 14, 192 }, - { 89, 2, 138 }, - { 52, 1, 84 }, - { 29, 1, 47 }, - { 14, 1, 23 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 3, 207, 245 }, - { 12, 102, 213 }, - { 18, 33, 144 } - }, { /* Coeff Band 1 */ - { 85, 205, 245 }, - { 18, 249, 242 }, - { 59, 221, 229 }, - { 91, 166, 213 }, - { 88, 117, 183 }, - { 70, 95, 149 } - }, { /* Coeff Band 2 */ - { 114, 193, 241 }, - { 104, 155, 221 }, - { 100, 33, 181 }, - { 78, 10, 132 }, - { 43, 2, 75 }, - { 15, 1, 48 } - }, { /* Coeff Band 3 */ - { 118, 198, 244 }, - { 117, 142, 224 }, - { 111, 25, 179 }, - { 83, 4, 134 }, - { 57, 1, 84 }, - { 1, 1, 1 } - }, { /* Coeff Band 4 */ - { 144, 201, 248 }, - { 136, 130, 234 }, - { 124, 12, 188 }, - { 83, 1, 130 }, - { 61, 1, 66 }, - { 64, 171, 128 } - }, { /* Coeff Band 5 */ - { 174, 227, 250 }, - { 165, 118, 242 }, - { 132, 21, 197 }, - { 84, 3, 134 }, - { 70, 1, 69 }, - { 1, 1, 1 } - } - } - } -}; -#else static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { { /* block Type 0 */ { /* Intra */ @@ -1381,4 +693,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { } } }; -#endif + diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index 080867e..0ad0dbc 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -15,6 +15,8 @@ #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) + DECLARE_ALIGNED(16, const uint8_t, 
vp9_norm[256]) = { 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -50,28 +52,28 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; -DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = { 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = { 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, }; -DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = { +DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, 33, 19, 40, 12, 34, 27, 5, 41, @@ -82,7 +84,7 @@ DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = { 46, 39, 61, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, 26, 41, 11, 56, 19, 34, 4, 49, @@ -93,7 +95,7 @@ DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { 31, 61, 39, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, 19, 26, 32, 6, 13, 20, 33, 27, @@ -104,7 +106,7 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { 60, 39, 61, 47, 54, 55, 62, 63, }; -DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, @@ -123,7 +125,7 @@ DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = { 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255, }; -DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, @@ -142,7 +144,7 @@ DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255, }; -DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, @@ -161,7 +163,7 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255, }; -DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 
227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136, @@ -200,13 +202,8 @@ DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = { const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ { -#if CONFIG_BALANCED_COEFTREE - -ZERO_TOKEN, 2, /* 0 = ZERO */ - -DCT_EOB_TOKEN, 4, /* 1 = EOB */ -#else -DCT_EOB_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ -#endif -ONE_TOKEN, 6, /* 2 = ONE */ 8, 12, /* 3 = LOW_VAL */ -TWO_TOKEN, 10, /* 4 = TWO */ @@ -233,13 +230,8 @@ static const vp9_prob Pcat6[] = { }; const vp9_tree_index vp9_coefmodel_tree[6] = { -#if CONFIG_BALANCED_COEFTREE - -ZERO_TOKEN, 2, - -DCT_EOB_MODEL_TOKEN, 4, -#else -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ -#endif -ONE_TOKEN, -TWO_TOKEN, }; @@ -252,7 +244,7 @@ const vp9_tree_index vp9_coefmodel_tree[6] = { // the probabilities for the rest of the nodes. // beta = 8 -const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { +static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { { 3, 86, 128, 6, 86, 23, 88, 29}, { 9, 86, 129, 17, 88, 61, 94, 76}, { 15, 87, 129, 28, 89, 93, 100, 110}, @@ -386,8 +378,7 @@ const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { static void extend_model_to_full_distribution(vp9_prob p, vp9_prob *tree_probs) { const int l = ((p - 1) / 2); - const vp9_prob (*model)[MODEL_NODES]; - model = vp9_modelcoefprobs_pareto8; + const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; if (p & 1) { vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, model[l], MODEL_NODES * sizeof(vp9_prob)); @@ -406,16 +397,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { extend_model_to_full_distribution(model[PIVOT_NODE], full); } -void vp9_model_to_full_probs_sb( - vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES], - vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) { - int c, p; - for (c = 0; c < COEF_BANDS; ++c) - for (p = 0; p < PREV_COEF_CONTEXTS; ++p) { - vp9_model_to_full_probs(model[c][p], full[c][p]); - } -} - static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; static void init_bit_tree(vp9_tree_index *p, int n) { @@ -455,32 +436,6 @@ vp9_extra_bit vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" -// This function updates and then returns n AC coefficient context -// This is currently a placeholder function to allow experimentation -// using various context models based on the energy earlier tokens -// within the current block. -// -// For now it just returns the previously used context. 
-#define MAX_NEIGHBORS 2 -int vp9_get_coef_context(const int *scan, const int *neighbors, - int nb_pad, uint8_t *token_cache, int c, int l) { - int eob = l; - assert(nb_pad == MAX_NEIGHBORS); - if (c == eob) { - return 0; - } else { - int ctx; - assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0); - if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) { - ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] + - token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1; - } else { - ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]]; - } - return ctx; - } -}; - void vp9_default_coef_probs(VP9_COMMON *pc) { vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4, sizeof(pc->fc.coef_probs[TX_4X4])); @@ -496,28 +451,39 @@ void vp9_default_coef_probs(VP9_COMMON *pc) { // in {top, left, topleft, topright, bottomleft} order // for each position in raster scan order. // -1 indicates the neighbor does not exist. -DECLARE_ALIGNED(16, int, - vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]); - -static int find_in_scan(const int *scan, int l, int idx) { +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); + +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); + +static int find_in_scan(const int16_t *scan, int l, int idx) { int n, l2 = l * l; for (n = 0; n < l2; n++) { int rc = scan[n]; @@ -527,14 +493,19 @@ static int find_in_scan(const int *scan, int l, int idx) { assert(0); return -1; } -static void init_scan_neighbors(const int *scan, int l, int *neighbors, - int max_neighbors) { +static void 
init_scan_neighbors(const int16_t *scan, + int16_t *iscan, + int l, int16_t *neighbors) { int l2 = l * l; int n, i, j; - for (n = 0; n < l2; n++) { + // dc doesn't use this type of prediction + neighbors[MAX_NEIGHBORS * 0 + 0] = 0; + neighbors[MAX_NEIGHBORS * 0 + 1] = 0; + iscan[0] = find_in_scan(scan, l, 0); + for (n = 1; n < l2; n++) { int rc = scan[n]; - assert(max_neighbors == MAX_NEIGHBORS); + iscan[n] = find_in_scan(scan, l, n); i = rc / l; j = rc % l; if (i > 0 && j > 0) { @@ -546,93 +517,84 @@ static void init_scan_neighbors(const int *scan, int l, int *neighbors, // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff // as a context. If ADST or DCT is used in both directions, we // use the combination of the two as a context. - int a = find_in_scan(scan, l, (i - 1) * l + j); - int b = find_in_scan(scan, l, i * l + j - 1); + int a = (i - 1) * l + j; + int b = i * l + j - 1; if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || scan == vp9_col_scan_16x16) { - neighbors[max_neighbors * n + 0] = a; - neighbors[max_neighbors * n + 1] = -1; + // in the col/row scan cases (as well as left/top edge cases), we set + // both contexts to the same value, so we can branchlessly do a+b+1>>1 + // which automatically becomes a if a == b + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = a; } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || scan == vp9_row_scan_16x16) { - neighbors[max_neighbors * n + 0] = b; - neighbors[max_neighbors * n + 1] = -1; + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = b; } else { - neighbors[max_neighbors * n + 0] = a; - neighbors[max_neighbors * n + 1] = b; + neighbors[MAX_NEIGHBORS * n + 0] = a; + neighbors[MAX_NEIGHBORS * n + 1] = b; } } else if (i > 0) { - neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j); - neighbors[max_neighbors * n + 1] = -1; - } else if (j > 0) { - neighbors[max_neighbors * n + 0] = - find_in_scan(scan, l, i * l + j - 1); - neighbors[max_neighbors * n + 1] = -1; + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j; } else { - assert(n == 0); - // dc predictor doesn't use previous tokens - neighbors[max_neighbors * n + 0] = -1; + assert(j > 0); + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1; } - assert(neighbors[max_neighbors * n + 0] < n); + assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n); } + // one padding item so we don't have to add branches in code to handle + // calls to get_coef_context() for the token after the final dc token + neighbors[MAX_NEIGHBORS * l2 + 0] = 0; + neighbors[MAX_NEIGHBORS * l2 + 1] = 0; } void vp9_init_neighbors() { - init_scan_neighbors(vp9_default_scan_4x4, 4, - vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_row_scan_4x4, 4, - vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_col_scan_4x4, 4, - vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_default_scan_8x8, 8, - vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_row_scan_8x8, 8, - vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_col_scan_8x8, 8, - vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_default_scan_16x16, 16, - vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_row_scan_16x16, 16, - vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_col_scan_16x16, 16, - vp9_col_scan_16x16_neighbors, 
MAX_NEIGHBORS); - init_scan_neighbors(vp9_default_scan_32x32, 32, - vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, + vp9_default_scan_4x4_neighbors); + init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, + vp9_row_scan_4x4_neighbors); + init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, + vp9_col_scan_4x4_neighbors); + init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, + vp9_default_scan_8x8_neighbors); + init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, + vp9_row_scan_8x8_neighbors); + init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, + vp9_col_scan_8x8_neighbors); + init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, + vp9_default_scan_16x16_neighbors); + init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, + vp9_row_scan_16x16_neighbors); + init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, + vp9_col_scan_16x16_neighbors); + init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, + vp9_default_scan_32x32_neighbors); } -const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) { +const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { if (scan == vp9_default_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_4x4_neighbors; } else if (scan == vp9_row_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_row_scan_4x4_neighbors; } else if (scan == vp9_col_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_col_scan_4x4_neighbors; } else if (scan == vp9_default_scan_8x8) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_8x8_neighbors; } else if (scan == vp9_row_scan_8x8) { - *pad = 2; return vp9_row_scan_8x8_neighbors; } else if (scan == vp9_col_scan_8x8) { - *pad = 2; return vp9_col_scan_8x8_neighbors; } else if (scan == vp9_default_scan_16x16) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_16x16_neighbors; } else if (scan == vp9_row_scan_16x16) { - *pad = 2; return vp9_row_scan_16x16_neighbors; } else if (scan == vp9_col_scan_16x16) { - *pad = 2; return vp9_col_scan_16x16_neighbors; - } else if (scan == vp9_default_scan_32x32) { - *pad = MAX_NEIGHBORS; - return vp9_default_scan_32x32_neighbors; } else { - assert(0); - return NULL; + assert(scan == vp9_default_scan_32x32); + return vp9_default_scan_32x32_neighbors; } } @@ -651,38 +613,15 @@ void vp9_coef_tree_initialize() { #define COEF_COUNT_SAT_AFTER_KEY 24 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 -void vp9_full_to_model_count(unsigned int *model_count, - unsigned int *full_count) { - int n; - model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; - model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; - model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; - for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n) - model_count[TWO_TOKEN] += full_count[n]; - model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN]; -} - -void vp9_full_to_model_counts( - vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) { - int i, j, k, l; - for (i = 0; i < BLOCK_TYPES; ++i) - for (j = 0; j < REF_TYPES; ++j) - for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; - vp9_full_to_model_count(model_count[i][j][k][l], - full_count[i][j][k][l]); - } -} - static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, int count_sat, int update_factor) { + FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size]; - 
vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size]; - vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size]; + vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size]; + vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cm->fc.eob_branch_counts[txfm_size]; + cm->counts.eob_branch[txfm_size]; int t, i, j, k, l, count; int factor; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; @@ -699,13 +638,8 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, vp9_coefmodel_tree, coef_probs, branch_ct, coef_counts[i][j][k][l], 0); -#if CONFIG_BALANCED_COEFTREE - branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0]; - coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]); -#else branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); -#endif for (t = 0; t < entropy_nodes_adapt; ++t) { count = branch_ct[t][0] + branch_ct[t][1]; count = count > count_sat ? count_sat : count; diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 7f2bf3d..4ea727f 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -52,8 +52,6 @@ typedef struct { extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ -#define PROB_UPDATE_BASELINE_COST 7 - #define MAX_PROB 255 #define DCT_MAX_VALUE 16384 @@ -99,22 +97,62 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] struct VP9Common; void vp9_default_coef_probs(struct VP9Common *); -extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); + +extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]); -extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]); +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); -extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]); +extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); +extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]); +extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]); +extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); +extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]); +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]); +extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); +extern DECLARE_ALIGNED(16, 
int16_t, vp9_row_iscan_16x16[256]); + +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); + +#define MAX_NEIGHBORS 2 + +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); void vp9_coef_tree_initialize(void); void vp9_adapt_coef_probs(struct VP9Common *); @@ -148,9 +186,14 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) { ? (COEF_BANDS-1) : band_translate[coef_index]; } -extern int vp9_get_coef_context(const int *scan, const int *neighbors, - int nb_pad, uint8_t *token_cache, int c, int l); -const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); +static INLINE int get_coef_context(const int16_t *neighbors, + uint8_t *token_cache, + int c) { + return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; +} + +const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); // 128 lists of probabilities are stored for the following ONE node probs: @@ -160,7 +203,6 @@ const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); #define COEFPROB_MODELS 128 #define UNCONSTRAINED_NODES 3 -#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) #define PIVOT_NODE 2 // which node is pivot @@ -174,20 +216,10 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS] [PREV_COEF_CONTEXTS] [UNCONSTRAINED_NODES][2]; -extern void vp9_full_to_model_count(unsigned int *model_count, - unsigned int *full_count); -extern void vp9_full_to_model_counts( - vp9_coeff_count_model *model_count, vp9_coeff_count *full_count); void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -void vp9_model_to_full_probs_sb( - vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES], - vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]); - -extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1]; - -static INLINE const int* get_scan_4x4(TX_TYPE tx_type) { +static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: return vp9_row_scan_4x4; @@ -198,7 +230,36 @@ static INLINE const int* get_scan_4x4(TX_TYPE tx_type) { } } -static INLINE const int* get_scan_8x8(TX_TYPE tx_type) { +static INLINE void get_scan_nb_4x4(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_4x4; + *nb = vp9_row_scan_4x4_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_4x4; + *nb = vp9_col_scan_4x4_neighbors; + break; + default: + *scan = vp9_default_scan_4x4; + *nb = vp9_default_scan_4x4_neighbors; + break; + } +} + +static INLINE const int16_t* 
get_iscan_4x4(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_4x4; + case DCT_ADST: + return vp9_col_iscan_4x4; + default: + return vp9_default_iscan_4x4; + } +} + +static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: return vp9_row_scan_8x8; @@ -209,7 +270,36 @@ static INLINE const int* get_scan_8x8(TX_TYPE tx_type) { } } -static INLINE const int* get_scan_16x16(TX_TYPE tx_type) { +static INLINE void get_scan_nb_8x8(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_8x8; + *nb = vp9_row_scan_8x8_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_8x8; + *nb = vp9_col_scan_8x8_neighbors; + break; + default: + *scan = vp9_default_scan_8x8; + *nb = vp9_default_scan_8x8_neighbors; + break; + } +} + +static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_8x8; + case DCT_ADST: + return vp9_col_iscan_8x8; + default: + return vp9_default_iscan_8x8; + } +} + +static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: return vp9_row_scan_16x16; @@ -220,6 +310,35 @@ static INLINE const int* get_scan_16x16(TX_TYPE tx_type) { } } +static INLINE void get_scan_nb_16x16(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_16x16; + *nb = vp9_row_scan_16x16_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_16x16; + *nb = vp9_col_scan_16x16_neighbors; + break; + default: + *scan = vp9_default_scan_16x16; + *nb = vp9_default_scan_16x16_neighbors; + break; + } +} + +static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_16x16; + case DCT_ADST: + return vp9_col_iscan_16x16; + default: + return vp9_default_iscan_16x16; + } +} + enum { VP9_COEF_UPDATE_PROB = 252 }; #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 3302814..ca188e4 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -8,15 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_modecont.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_alloccommon.h" -#include "vpx_mem/vpx_mem.h" -static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, @@ -51,8 +50,9 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ }; -const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] - [PARTITION_TYPES - 1] = { +static const vp9_prob default_partition_probs[NUM_FRAME_TYPES] + [NUM_PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { { /* frame_type = keyframe */ /* 8x8 -> 4x4 */ { 158, 97, 94 } /* a/l both not split */, @@ -98,6 +98,133 @@ const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] } }; +const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] + [VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { + { /* above = dc */ + { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, + { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, + { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, + { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, + { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, + { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, + { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, + { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, + { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ + }, { /* above = v */ + { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, + { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, + { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, + { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, + { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, + { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, + { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, + { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, + { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ + }, { /* above = h */ + { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, + { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, + { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, + { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, + { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, + { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, + { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, + { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, + { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ + }, { /* above = d45 */ + { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, + { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, + { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, + { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, + { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, + { 53, 26, 34, 130, 56, 149, 
84, 120, 103 } /* left = d117 */, + { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, + { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, + { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ + }, { /* above = d135 */ + { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, + { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, + { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, + { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, + { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, + { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, + { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, + { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, + { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ + }, { /* above = d117 */ + { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, + { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, + { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */, + { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, + { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, + { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, + { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, + { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, + { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ + }, { /* above = d153 */ + { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, + { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, + { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, + { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, + { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, + { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, + { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, + { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, + { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ + }, { /* above = d27 */ + { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, + { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, + { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, + { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, + { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, + { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, + { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, + { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, + { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ + }, { /* above = d63 */ + { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, + { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, + { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, + { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, + { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, + { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, + { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, + { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, + { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ + }, { /* above = tm */ + { 65, 70, 60, 155, 159, 
199, 61, 60, 81 } /* left = dc */, + { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, + { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, + { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, + { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, + { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, + { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, + { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, + { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ + } +}; + +static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] + [VP9_INTER_MODES - 1] = { + {2, 173, 34}, // 0 = both zero mv + {7, 145, 85}, // 1 = one zero mv + one a predicted mv + {7, 166, 63}, // 2 = two predicted mvs + {7, 94, 66}, // 3 = one predicted/zero and one new mv + {8, 64, 46}, // 4 = two new mvs + {17, 81, 31}, // 5 = one intra neighbour + x + {25, 29, 30}, // 6 = two intra neighbours +}; + /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { -DC_PRED, 2, /* 0 = DC_NODE */ @@ -111,7 +238,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { -D153_PRED, -D27_PRED /* 8 = D153_NODE */ }; -const vp9_tree_index vp9_sb_mv_ref_tree[6] = { +const vp9_tree_index vp9_inter_mode_tree[6] = { -ZEROMV, 2, -NEARESTMV, 4, -NEARMV, -NEWMV @@ -124,8 +251,7 @@ const vp9_tree_index vp9_partition_tree[6] = { }; struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; - -struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES]; +struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; @@ -149,20 +275,15 @@ static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = { { 238, 247 } }; -const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 1] = { - { 3, 136, 37, }, - { 5, 52, 13, }, -}; -const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 2] = { - { 20, 152, }, - { 15, 101, }, -}; -const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 3] = { - { 100, }, - { 66, }, +static const struct tx_probs default_tx_probs = { + { { 3, 136, 37 }, + { 5, 52, 13 } }, + + { { 20, 152 }, + { 15, 101 } }, + + { { 100 }, + { 66 } } }; void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, @@ -181,52 +302,40 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, unsigned int (*ct_16x16p)[2]) { ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; - ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + - tx_count_16x16p[TX_16X16]; + ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; ct_16x16p[1][0] = tx_count_16x16p[TX_8X8]; ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; } void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]) { - ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; - ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; + ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; + ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; } -const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = { +static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -void vp9_init_mbmode_probs(VP9_COMMON *x) { - vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs, - sizeof(default_if_uv_probs)); - vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs, - 
sizeof(default_kf_uv_probs)); - vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs, - sizeof(default_if_y_probs)); - - vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob, - sizeof(vp9_switchable_interp_prob)); - - vpx_memcpy(x->fc.partition_prob, vp9_partition_probs, - sizeof(vp9_partition_probs)); - - vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p, - sizeof(default_intra_inter_p)); - vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p, - sizeof(default_comp_inter_p)); - vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p, - sizeof(default_comp_ref_p)); - vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p, - sizeof(default_single_ref_p)); - vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p, - sizeof(vp9_default_tx_probs_32x32p)); - vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p, - sizeof(vp9_default_tx_probs_16x16p)); - vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p, - sizeof(vp9_default_tx_probs_8x8p)); - vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs, - sizeof(vp9_default_mbskip_probs)); +static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1] + [VP9_SWITCHABLE_FILTERS-1] = { + { 235, 162, }, + { 36, 255, }, + { 34, 3, }, + { 149, 144, }, +}; + +void vp9_init_mbmode_probs(VP9_COMMON *cm) { + vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs); + vp9_copy(cm->fc.y_mode_prob, default_if_y_probs); + vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(cm->fc.partition_prob, default_partition_probs); + vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p); + vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p); + vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p); + vp9_copy(cm->fc.single_ref_prob, default_single_ref_p); + cm->fc.tx_probs = default_tx_probs; + vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); } const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { @@ -236,40 +345,22 @@ const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP}; -const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1}; -const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS-1] = { - { 235, 162, }, - { 36, 255, }, - { 34, 3, }, - { 149, 144, }, -}; - -// Indicates if the filter is interpolating or non-interpolating -const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1}; +const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, 0, 2, -1, -1}; void vp9_entropy_mode_init() { vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); vp9_tokens_from_tree(vp9_switchable_interp_encodings, vp9_switchable_interp_tree); vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - - vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array, - vp9_sb_mv_ref_tree, NEARESTMV); -} - -void vp9_init_mode_contexts(VP9_COMMON *pc) { - vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts)); - vpx_memcpy(pc->fc.inter_mode_probs, - vp9_default_inter_mode_probs, - sizeof(vp9_default_inter_mode_probs)); + vp9_tokens_from_tree_offset(vp9_inter_mode_encodings, + vp9_inter_mode_tree, NEARESTMV); } void vp9_accum_mv_refs(VP9_COMMON *pc, MB_PREDICTION_MODE m, const int context) { unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = - 
pc->fc.inter_mode_counts; + pc->counts.inter_mode; if (m == ZEROMV) { ++inter_mode_counts[context][0][0]; @@ -288,39 +379,32 @@ void vp9_accum_mv_refs(VP9_COMMON *pc, } } -#define MVREF_COUNT_SAT 20 -#define MVREF_MAX_UPDATE_FACTOR 128 -void vp9_adapt_mode_context(VP9_COMMON *pc) { - int i, j; - unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = - pc->fc.inter_mode_counts; - vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs; - - for (j = 0; j < INTER_MODE_CONTEXTS; j++) { - for (i = 0; i < VP9_INTER_MODES - 1; i++) { - int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1]; - int factor; - count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count; - factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT); - mode_context[j][i] = weighted_prob( - pc->fc.pre_inter_mode_probs[j][i], - get_binary_prob(inter_mode_counts[j][i][0], - inter_mode_counts[j][i][1]), - factor); - } - } -} +#define COUNT_SAT 20 +#define MAX_UPDATE_FACTOR 128 -#define MODE_COUNT_SAT 20 -#define MODE_MAX_UPDATE_FACTOR 128 -static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob, - unsigned int branch_ct[2]) { - int factor, count = branch_ct[0] + branch_ct[1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); +static int update_ct(vp9_prob pre_prob, vp9_prob prob, + unsigned int ct[2]) { + const int count = MIN(ct[0] + ct[1], COUNT_SAT); + const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT; return weighted_prob(pre_prob, prob, factor); } +static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) { + return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct); +} + +void vp9_adapt_mode_context(VP9_COMMON *pc) { + int i, j; + FRAME_CONTEXT *const fc = &pc->fc; + FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx]; + FRAME_COUNTS *const counts = &pc->counts; + + for (j = 0; j < INTER_MODE_CONTEXTS; j++) + for (i = 0; i < VP9_INTER_MODES - 1; i++) + fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i], + counts->inter_mode[j][i]); +} + static void update_mode_probs(int n_modes, const vp9_tree_index *tree, unsigned int *cnt, vp9_prob *pre_probs, vp9_prob *dst_probs, @@ -333,189 +417,127 @@ static void update_mode_probs(int n_modes, assert(n_modes - 1 < MAX_PROBS); vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); for (t = 0; t < n_modes - 1; ++t) - dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]); -} - -static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) { - return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0], - branch_ct[1]), branch_ct); + dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]); } -// #define MODE_COUNT_TESTING void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; FRAME_CONTEXT *fc = &cm->fc; -#ifdef MODE_COUNT_TESTING - int t; - - printf("static const unsigned int\nymode_counts" - "[VP9_INTRA_MODES] = {\n"); - for (t = 0; t < VP9_INTRA_MODES; ++t) - printf("%d, ", fc->ymode_counts[t]); - printf("};\n"); - printf("static const unsigned int\nuv_mode_counts" - "[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n"); - for (i = 0; i < VP9_INTRA_MODES; ++i) { - printf(" {"); - for (t = 0; t < VP9_INTRA_MODES; ++t) - printf("%d, ", fc->uv_mode_counts[i][t]); - printf("},\n"); - } - printf("};\n"); - printf("static const unsigned int\nbmode_counts" - "[VP9_NKF_BINTRAMODES] = {\n"); - for (t = 0; t < VP9_NKF_BINTRAMODES; ++t) - printf("%d, ", fc->bmode_counts[t]); - 
printf("};\n"); - printf("static const unsigned int\ni8x8_mode_counts" - "[VP9_I8X8_MODES] = {\n"); - for (t = 0; t < VP9_I8X8_MODES; ++t) - printf("%d, ", fc->i8x8_mode_counts[t]); - printf("};\n"); - printf("static const unsigned int\nmbsplit_counts" - "[VP9_NUMMBSPLITS] = {\n"); - for (t = 0; t < VP9_NUMMBSPLITS; ++t) - printf("%d, ", fc->mbsplit_counts[t]); - printf("};\n"); -#endif + FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i], - fc->intra_inter_count[i]); + fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], + counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i], - fc->comp_inter_count[i]); + fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i], + counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i], - fc->comp_ref_count[i]); + fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i], + counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j], - fc->single_ref_count[i][j]); + fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], + counts->single_ref[i][j]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, - fc->y_mode_counts[i], fc->pre_y_mode_prob[i], + counts->y_mode[i], pre_fc->y_mode_prob[i], fc->y_mode_prob[i], 0); for (i = 0; i < VP9_INTRA_MODES; ++i) update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, - fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i], + counts->uv_mode[i], pre_fc->uv_mode_prob[i], fc->uv_mode_prob[i], 0); for (i = 0; i < NUM_PARTITION_CONTEXTS; i++) update_mode_probs(PARTITION_TYPES, vp9_partition_tree, - fc->partition_counts[i], fc->pre_partition_prob[i], + counts->partition[i], + pre_fc->partition_prob[INTER_FRAME][i], fc->partition_prob[INTER_FRAME][i], 0); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree, - fc->switchable_interp_count[i], - fc->pre_switchable_interp_prob[i], + counts->switchable_interp[i], + pre_fc->switchable_interp_prob[i], fc->switchable_interp_prob[i], 0); - } } - if (cm->txfm_mode == TX_MODE_SELECT) { + + if (cm->tx_mode == TX_MODE_SELECT) { int j; unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2]; unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2]; unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2]; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i], - branch_ct_8x8p); - for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) { - int factor; - int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1]; - vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0], - branch_ct_8x8p[j][1]); - count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - cm->fc.tx_probs_8x8p[i][j] = weighted_prob( - cm->fc.pre_tx_probs_8x8p[i][j], prob, factor); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i], + tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); + for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], + branch_ct_8x8p[j]); + + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) { - int factor; - int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1]; - vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0], - branch_ct_16x16p[j][1]); - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - cm->fc.tx_probs_16x16p[i][j] = weighted_prob( - cm->fc.pre_tx_probs_16x16p[i][j], prob, factor); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i], + for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], + branch_ct_16x16p[j]); + + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) { - int factor; - int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1]; - vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0], - branch_ct_32x32p[j][1]); - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - cm->fc.tx_probs_32x32p[i][j] = weighted_prob( - cm->fc.pre_tx_probs_32x32p[i][j], prob, factor); - } + for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], + branch_ct_32x32p[j]); } } + for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i], - fc->mbskip_count[i]); + fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i], + counts->mbskip[i]); } static void set_default_lf_deltas(MACROBLOCKD *xd) { - xd->mode_ref_lf_delta_enabled = 1; - xd->mode_ref_lf_delta_update = 1; + xd->lf.mode_ref_delta_enabled = 1; + xd->lf.mode_ref_delta_update = 1; - xd->ref_lf_deltas[INTRA_FRAME] = 1; - xd->ref_lf_deltas[LAST_FRAME] = 0; - xd->ref_lf_deltas[GOLDEN_FRAME] = -1; - xd->ref_lf_deltas[ALTREF_FRAME] = -1; + xd->lf.ref_deltas[INTRA_FRAME] = 1; + xd->lf.ref_deltas[LAST_FRAME] = 0; + xd->lf.ref_deltas[GOLDEN_FRAME] = -1; + xd->lf.ref_deltas[ALTREF_FRAME] = -1; - xd->mode_lf_deltas[0] = 0; // Zero - xd->mode_lf_deltas[1] = 0; // New mv + xd->lf.mode_deltas[0] = 0; + xd->lf.mode_deltas[1] = 0; } void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). 
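
(Aside on the probability-adaptation refactor that runs through the vp9_entropymode.c hunks above: the per-frame update now reads the previous frame's probabilities from cm->frame_contexts[cm->frame_context_idx] and the observed counts from cm->counts, and every saturate-and-blend computation funnels through update_ct()/update_ct2(). The sketch below restates that arithmetic as standalone C. binary_prob() and adapt_prob() are illustrative stand-ins rather than the actual libvpx helpers; only the COUNT_SAT and MAX_UPDATE_FACTOR blending scheme is taken from the hunks shown.)

#include <stdint.h>
#include <stdio.h>

#define COUNT_SAT          20
#define MAX_UPDATE_FACTOR 128

typedef uint8_t vp9_prob;

/* p is roughly 256 * ct0 / (ct0 + ct1), clamped to [1, 255] so neither
   branch ever becomes impossible; 128 (a coin flip) with no observations. */
static vp9_prob binary_prob(unsigned ct0, unsigned ct1) {
  const unsigned den = ct0 + ct1;
  unsigned p = den ? (256u * ct0 + den / 2) / den : 128u;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return (vp9_prob)p;
}

/* Blend the previous frame's probability with the newly measured one; the
   blend weight grows linearly with the observation count and saturates at
   MAX_UPDATE_FACTOR/256 once COUNT_SAT samples have been seen. */
static vp9_prob adapt_prob(vp9_prob pre, const unsigned ct[2]) {
  unsigned count = ct[0] + ct[1];
  unsigned factor;
  if (count > COUNT_SAT) count = COUNT_SAT;
  factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
  return (vp9_prob)((pre * (256 - factor) +
                     binary_prob(ct[0], ct[1]) * factor + 128) >> 8);
}

int main(void) {
  const unsigned ct[2] = { 30, 10 };  /* 30 hits on branch 0, 10 on branch 1 */
  printf("adapted prob = %d\n", adapt_prob(128, ct));  /* 160: half-way to 192 */
  return 0;
}

With COUNT_SAT = 20 and MAX_UPDATE_FACTOR = 128, a context seen only a handful of times in the previous frame barely moves its probability, while twenty or more observations pull it exactly half-way toward the measured frequency.
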
int i; - vp9_clearall_segfeatures(xd); - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + vp9_clearall_segfeatures(&xd->seg); + xd->seg.abs_delta = SEGMENT_DELTADATA; if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); // Reset the mode ref deltas for loop filter - vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas)); - vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas)); + vp9_zero(xd->lf.last_ref_deltas); + vp9_zero(xd->lf.last_mode_deltas); set_default_lf_deltas(xd); + // To force update of the sharpness + xd->lf.last_sharpness_level = -1; + vp9_default_coef_probs(cm); vp9_init_mbmode_probs(cm); - vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs, - sizeof(vp9_kf_default_bmode_probs)); vp9_init_mv_probs(cm); + vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs); - // To force update of the sharpness - cm->last_sharpness_level = -1; - - vp9_init_mode_contexts(cm); - - if ((cm->frame_type == KEY_FRAME) || - cm->error_resilient_mode || (cm->reset_frame_context == 3)) { + if (cm->frame_type == KEY_FRAME || + cm->error_resilient_mode || cm->reset_frame_context == 3) { // Reset all frame contexts. for (i = 0; i < NUM_FRAME_CONTEXTS; ++i) - vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc)); + cm->frame_contexts[i] = cm->fc; } else if (cm->reset_frame_context == 2) { // Reset only the frame context specified in the frame header. - vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc, - sizeof(cm->fc)); + cm->frame_contexts[cm->frame_context_idx] = cm->fc; } vpx_memset(cm->prev_mip, 0, @@ -529,7 +551,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { vp9_update_mode_info_border(cm, cm->prev_mip); vp9_update_mode_info_in_image(cm, cm->prev_mi); - vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); + vp9_zero(cm->ref_frame_sign_bias); cm->frame_context_idx = 0; } diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index aa8aec7..8c14e7e 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -16,81 +16,68 @@ #define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 - #define VP9_MODE_UPDATE_PROB 252 +#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters // #define MODE_STATS -extern int vp9_mv_cont(const int_mv *l, const int_mv *a); +struct VP9Common; + +struct tx_probs { + vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; +}; +struct tx_counts { + unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; + unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; +}; -extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1]; extern const vp9_tree_index vp9_intra_mode_tree[]; -extern const vp9_tree_index vp9_sb_mv_ref_tree[]; +extern const vp9_tree_index vp9_inter_mode_tree[]; extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; - -/* Inter mode values do not start at zero */ - -extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES]; +extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; // probability models for 
partition information -extern const vp9_tree_index vp9_partition_tree[]; +extern const vp9_tree_index vp9_partition_tree[]; extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; -extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES] - [NUM_PARTITION_CONTEXTS] - [PARTITION_TYPES - 1]; - -void vp9_entropy_mode_init(void); - -struct VP9Common; -/* sets up common features to forget past dependence */ -void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); +extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp + [VP9_SWITCHABLE_FILTERS]; -void vp9_init_mbmode_probs(struct VP9Common *x); +extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; -extern void vp9_init_mode_contexts(struct VP9Common *pc); +extern const vp9_tree_index vp9_switchable_interp_tree + [2 * (VP9_SWITCHABLE_FILTERS - 1)]; -extern void vp9_adapt_mode_context(struct VP9Common *pc); +extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; -extern void vp9_accum_mv_refs(struct VP9Common *pc, - MB_PREDICTION_MODE m, - const int context); +void vp9_entropy_mode_init(); -void vp9_adapt_mode_probs(struct VP9Common *); +int vp9_mv_cont(const int_mv *l, const int_mv *a); -#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */ +void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); -extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp - [VP9_SWITCHABLE_FILTERS]; +void vp9_init_mbmode_probs(struct VP9Common *x); -extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; +void vp9_adapt_mode_context(struct VP9Common *pc); -extern const int vp9_is_interpolating_filter[SWITCHABLE + 1]; +void vp9_adapt_mode_probs(struct VP9Common *); -extern const vp9_tree_index vp9_switchable_interp_tree - [2 * (VP9_SWITCHABLE_FILTERS - 1)]; +void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context); -extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]); +void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]); +void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]); -extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - -extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 1]; -extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 2]; -extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 3]; - -extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, - unsigned int (*ct_32x32p)[2]); -extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, - unsigned int (*ct_16x16p)[2]); -extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, - unsigned int (*ct_8x8p)[2]); #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index e07e43c..343b624 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -12,17 +12,12 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_entropymv.h" -//#define MV_COUNT_TESTING - #define MV_COUNT_SAT 20 #define MV_MAX_UPDATE_FACTOR 128 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */ #define COMPANDED_MVREF_THRESH 8 -/* Smooth or bias the mv-counts 
before prob computation */ -/* #define SMOOTH_MV_COUNTS */ - const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, @@ -56,7 +51,7 @@ const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { }; struct vp9_token vp9_mv_fp_encodings[4]; -const nmv_context vp9_default_nmv_context = { +static const nmv_context default_nmv_context = { {32, 64, 96}, { { /* vert component */ @@ -82,21 +77,10 @@ const nmv_context vp9_default_nmv_context = { }, }; -MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { - if (mv->row == 0 && mv->col == 0) - return MV_JOINT_ZERO; - else if (mv->row == 0 && mv->col != 0) - return MV_JOINT_HNZVZ; - else if (mv->row != 0 && mv->col == 0) - return MV_JOINT_HZVNZ; - else - return MV_JOINT_HNZVNZ; -} - #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { - MV_CLASS_TYPE c; + MV_CLASS_TYPE c = MV_CLASS_0; if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; @@ -114,7 +98,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { return c; } -int vp9_use_nmv_hp(const MV *ref) { +int vp9_use_mv_hp(const MV *ref) { return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; } @@ -123,95 +107,71 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { return mv_class_base(c) + offset; } -static void increment_nmv_component_count(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { - assert (v != 0); /* should not be zero */ - mvcomp->mvcount[MV_MAX + v] += incr; +static void inc_mv_component_count(int v, nmv_component_counts *comp_counts, + int incr) { + assert (v != 0); + comp_counts->mvcount[MV_MAX + v] += incr; } -static void increment_nmv_component(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { +static void inc_mv_component(int v, nmv_component_counts *comp_counts, + int incr, int usehp) { int s, z, c, o, d, e, f; if (!incr) return; assert (v != 0); /* should not be zero */ s = v < 0; - mvcomp->sign[s] += incr; + comp_counts->sign[s] += incr; z = (s ? 
-v : v) - 1; /* magnitude - 1 */ c = vp9_get_mv_class(z, &o); - mvcomp->classes[c] += incr; + comp_counts->classes[c] += incr; d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ e = (o & 1); /* high precision mv data */ if (c == MV_CLASS_0) { - mvcomp->class0[d] += incr; + comp_counts->class0[d] += incr; } else { int i; int b = c + CLASS0_BITS - 1; // number of bits for (i = 0; i < b; ++i) - mvcomp->bits[i][((d >> i) & 1)] += incr; + comp_counts->bits[i][((d >> i) & 1)] += incr; } /* Code the fractional pel bits */ if (c == MV_CLASS_0) { - mvcomp->class0_fp[d][f] += incr; + comp_counts->class0_fp[d][f] += incr; } else { - mvcomp->fp[f] += incr; + comp_counts->fp[f] += incr; } /* Code the high precision bit */ if (usehp) { if (c == MV_CLASS_0) { - mvcomp->class0_hp[e] += incr; + comp_counts->class0_hp[e] += incr; } else { - mvcomp->hp[e] += incr; + comp_counts->hp[e] += incr; } } } -#ifdef SMOOTH_MV_COUNTS -static void smooth_counts(nmv_component_counts *mvcomp) { - static const int flen = 3; // (filter_length + 1) / 2 - static const int fval[] = {8, 3, 1}; - static const int fvalbits = 4; - int i; - unsigned int smvcount[MV_VALS]; - vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount)); - smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1; - for (i = flen - 1; i <= MV_VALS - flen; ++i) { - int j, s = smvcount[i] * fval[0]; - for (j = 1; j < flen; ++j) - s += (smvcount[i - j] + smvcount[i + j]) * fval[j]; - mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits; - } -} -#endif - static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { int v; vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); for (v = 1; v <= MV_MAX; v++) { - increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); - increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); + inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); + inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); } } -void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, - int usehp) { +void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); mvctx->joints[j]++; - usehp = usehp && vp9_use_nmv_hp(ref); if (mv_joint_vertical(j)) - increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp); + inc_mv_component_count(mv->row, &mvctx->comps[0], 1); if (mv_joint_horizontal(j)) - increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp); + inc_mv_component_count(mv->col, &mvctx->comps[1], 1); } static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { @@ -230,79 +190,6 @@ void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { counts_to_context(&nmv_count->comps[1], usehp); } -void vp9_counts_to_nmv_context( - nmv_context_counts *nmv_count, - nmv_context *prob, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]) { - int i, j, k; - vp9_counts_process(nmv_count, usehp); - vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - prob->joints, - branch_ct_joint, - nmv_count->joints, 0); - for (i = 0; i < 2; ++i) { - 
const uint32_t s0 = nmv_count->comps[i].sign[0]; - const uint32_t s1 = nmv_count->comps[i].sign[1]; - - prob->comps[i].sign = get_binary_prob(s0, s1); - branch_ct_sign[i][0] = s0; - branch_ct_sign[i][1] = s1; - vp9_tree_probs_from_distribution(vp9_mv_class_tree, - prob->comps[i].classes, - branch_ct_classes[i], - nmv_count->comps[i].classes, 0); - vp9_tree_probs_from_distribution(vp9_mv_class0_tree, - prob->comps[i].class0, - branch_ct_class0[i], - nmv_count->comps[i].class0, 0); - for (j = 0; j < MV_OFFSET_BITS; ++j) { - const uint32_t b0 = nmv_count->comps[i].bits[j][0]; - const uint32_t b1 = nmv_count->comps[i].bits[j][1]; - - prob->comps[i].bits[j] = get_binary_prob(b0, b1); - branch_ct_bits[i][j][0] = b0; - branch_ct_bits[i][j][1] = b1; - } - } - for (i = 0; i < 2; ++i) { - for (k = 0; k < CLASS0_SIZE; ++k) { - vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].class0_fp[k], - branch_ct_class0_fp[i][k], - nmv_count->comps[i].class0_fp[k], 0); - } - vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].fp, - branch_ct_fp[i], - nmv_count->comps[i].fp, 0); - } - if (usehp) { - for (i = 0; i < 2; ++i) { - const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0]; - const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1]; - const uint32_t hp0 = nmv_count->comps[i].hp[0]; - const uint32_t hp1 = nmv_count->comps[i].hp[1]; - - prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); - branch_ct_class0_hp[i][0] = c0_hp0; - branch_ct_class0_hp[i][1] = c0_hp1; - - prob->comps[i].hp = get_binary_prob(hp0, hp1); - branch_ct_hp[i][0] = hp0; - branch_ct_hp[i][1] = hp1; - } - } -} - static unsigned int adapt_probs(unsigned int i, vp9_tree tree, vp9_prob this_probs[], @@ -332,110 +219,45 @@ static unsigned int adapt_probs(unsigned int i, } -void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { +void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { int i, j; -#ifdef MV_COUNT_TESTING - printf("joints count: "); - for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]); - printf("\n"); fflush(stdout); - printf("signs count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]); - printf("\n"); fflush(stdout); - printf("classes count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_CLASSES; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].classes[j]); - printf("\n"); fflush(stdout); - } - printf("class0 count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].class0[j]); - printf("\n"); fflush(stdout); - } - printf("bits count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0], - cm->fc.NMVcount.comps[i].bits[j][1]); - printf("\n"); fflush(stdout); - } - printf("class0_fp count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("\n"); fflush(stdout); - } - printf("fp count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < 4; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].fp[j]); - printf("\n"); fflush(stdout); - } - if (usehp) { - printf("class0_hp count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0], - cm->fc.NMVcount.comps[i].class0_hp[1]); - printf("\n"); fflush(stdout); - printf("hp count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", 
cm->fc.NMVcount.comps[i].hp[0], - cm->fc.NMVcount.comps[i].hp[1]); - printf("\n"); fflush(stdout); - } -#endif -#ifdef SMOOTH_MV_COUNTS - smooth_counts(&cm->fc.NMVcount.comps[0]); - smooth_counts(&cm->fc.NMVcount.comps[1]); -#endif - vp9_counts_process(&cm->fc.NMVcount, usehp); - adapt_probs(0, vp9_mv_joint_tree, - cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints, - cm->fc.NMVcount.joints); + FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + + nmv_context *ctx = &cm->fc.nmvc; + nmv_context *pre_ctx = &pre_fc->nmvc; + nmv_context_counts *cts = &cm->counts.mv; + + vp9_counts_process(cts, usehp); + + adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); for (i = 0; i < 2; ++i) { - adapt_prob(&cm->fc.nmvc.comps[i].sign, - cm->fc.pre_nmvc.comps[i].sign, - cm->fc.NMVcount.comps[i].sign); - adapt_probs(0, vp9_mv_class_tree, - cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes, - cm->fc.NMVcount.comps[i].classes); - adapt_probs(0, vp9_mv_class0_tree, - cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0, - cm->fc.NMVcount.comps[i].class0); - for (j = 0; j < MV_OFFSET_BITS; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].bits[j], - cm->fc.pre_nmvc.comps[i].bits[j], - cm->fc.NMVcount.comps[i].bits[j]); - } + adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign); + adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, + pre_ctx->comps[i].classes, cts->comps[i].classes); + adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, + pre_ctx->comps[i].class0, cts->comps[i].class0); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j], + cts->comps[i].bits[j]); } + for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - adapt_probs(0, vp9_mv_fp_tree, - cm->fc.nmvc.comps[i].class0_fp[j], - cm->fc.pre_nmvc.comps[i].class0_fp[j], - cm->fc.NMVcount.comps[i].class0_fp[j]); - } - adapt_probs(0, vp9_mv_fp_tree, - cm->fc.nmvc.comps[i].fp, - cm->fc.pre_nmvc.comps[i].fp, - cm->fc.NMVcount.comps[i].fp); + for (j = 0; j < CLASS0_SIZE; ++j) + adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], + pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); + + adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, + cts->comps[i].fp); } + if (usehp) { for (i = 0; i < 2; ++i) { - adapt_prob(&cm->fc.nmvc.comps[i].class0_hp, - cm->fc.pre_nmvc.comps[i].class0_hp, - cm->fc.NMVcount.comps[i].class0_hp); - adapt_prob(&cm->fc.nmvc.comps[i].hp, - cm->fc.pre_nmvc.comps[i].hp, - cm->fc.NMVcount.comps[i].hp); + adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp, + cts->comps[i].class0_hp); + adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp); } } } @@ -448,5 +270,5 @@ void vp9_entropy_mv_init() { } void vp9_init_mv_probs(VP9_COMMON *cm) { - vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context)); + cm->fc.nmvc = default_nmv_context; } diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index 15994a6..85a1f3a 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -21,15 +21,11 @@ struct VP9Common; void vp9_entropy_mv_init(); void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp); -int vp9_use_nmv_hp(const MV *ref); +void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +int vp9_use_mv_hp(const MV *ref); #define VP9_NMV_UPDATE_PROB 252 -//#define MV_GROUP_UPDATE - -#define LOW_PRECISION_MV_UPDATE /* Use 7 
bit forward update */ - /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 typedef enum { @@ -99,7 +95,14 @@ typedef struct { nmv_component comps[2]; } nmv_context; -MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv); +static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { + if (mv->row == 0) { + return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; + } else { + return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; + } +} + MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset); @@ -121,22 +124,8 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, - int usehp); -extern const nmv_context vp9_default_nmv_context; -void vp9_counts_to_nmv_context( - nmv_context_counts *NMVcount, - nmv_context *prob, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]); +void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); + void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index e18d353..86f0d0b 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -14,25 +14,28 @@ #include "./vpx_config.h" #define LOG2_MI_SIZE 3 +#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6 -#define MI_SIZE (1 << LOG2_MI_SIZE) -#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1) +#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit +#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block + +#define MI_MASK (MI_BLOCK_SIZE - 1) typedef enum BLOCK_SIZE_TYPE { - BLOCK_SIZE_AB4X4, - BLOCK_SIZE_SB4X8, - BLOCK_SIZE_SB8X4, - BLOCK_SIZE_SB8X8, - BLOCK_SIZE_SB8X16, - BLOCK_SIZE_SB16X8, - BLOCK_SIZE_MB16X16, - BLOCK_SIZE_SB16X32, - BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, - BLOCK_SIZE_SB32X64, - BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X64, - BLOCK_SIZE_TYPES + BLOCK_SIZE_AB4X4, BLOCK_4X4 = BLOCK_SIZE_AB4X4, + BLOCK_SIZE_SB4X8, BLOCK_4X8 = BLOCK_SIZE_SB4X8, + BLOCK_SIZE_SB8X4, BLOCK_8X4 = BLOCK_SIZE_SB8X4, + BLOCK_SIZE_SB8X8, BLOCK_8X8 = BLOCK_SIZE_SB8X8, + BLOCK_SIZE_SB8X16, BLOCK_8X16 = BLOCK_SIZE_SB8X16, + BLOCK_SIZE_SB16X8, BLOCK_16X8 = BLOCK_SIZE_SB16X8, + BLOCK_SIZE_MB16X16, BLOCK_16X16 = BLOCK_SIZE_MB16X16, + BLOCK_SIZE_SB16X32, BLOCK_16X32 = BLOCK_SIZE_SB16X32, + BLOCK_SIZE_SB32X16, BLOCK_32X16 = BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, BLOCK_32X32 = BLOCK_SIZE_SB32X32, + BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64, + BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64, + BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES } BLOCK_SIZE_TYPE; typedef enum PARTITION_TYPE { @@ -40,10 +43,34 @@ typedef enum PARTITION_TYPE { PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, - PARTITION_TYPES + PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES } PARTITION_TYPE; #define PARTITION_PLOFFSET 4 // number of probability models per block size #define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +typedef enum { + TX_4X4 = 0, // 4x4 dct transform + 
TX_8X8 = 1, // 8x8 dct transform + TX_16X16 = 2, // 16x16 dct transform + TX_32X32 = 3, // 32x32 dct transform + TX_SIZE_MAX_SB, // Number of transforms available to SBs +} TX_SIZE; + +typedef enum { + ONLY_4X4 = 0, + ALLOW_8X8 = 1, + ALLOW_16X16 = 2, + ALLOW_32X32 = 3, + TX_MODE_SELECT = 4, + NB_TXFM_MODES = 5, +} TX_MODE; + +typedef enum { + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3 // ADST in both directions +} TX_TYPE; + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index a692271..643b229 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_sadmxn.h" static void lower_mv_precision(int_mv *mv, int usehp) { - if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) { + if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) { if (mv->as_mv.row & 1) mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); if (mv->as_mv.col & 1) diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index d4ae210..b0fa505 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -28,18 +28,6 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int_mv *nearest, int_mv *near); -static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, - int_mv *mvp, const int *ref_frame_sign_bias) { - MV xmv = mvp->as_mv; - - if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { - xmv.row *= -1; - xmv.col *= -1; - } - - mvp->as_mv = xmv; -} - // TODO(jingning): this mv clamping function should be block size dependent. static void clamp_mv(int_mv *mv, int mb_to_left_edge, @@ -61,15 +49,6 @@ static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { return tmp_mv.as_int != mv->as_int; } -static int check_mv_bounds(int_mv *mv, - int mb_to_left_edge, int mb_to_right_edge, - int mb_to_top_edge, int mb_to_bottom_edge) { - return mv->as_mv.col < mb_to_left_edge || - mv->as_mv.col > mb_to_right_edge || - mv->as_mv.row < mb_to_top_edge || - mv->as_mv.row > mb_to_bottom_edge; -} - void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, MACROBLOCKD *xd, int_mv *dst_nearest, @@ -86,13 +65,13 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 1 + b)->as_mode.first); + return ((cur_mb->bmi + 1 + b)->as_mode); } else { return cur_mb->mbmi.mode; } } assert(b == 1 || b == 3); - return (cur_mb->bmi + b - 1)->as_mode.first; + return (cur_mb->bmi + b - 1)->as_mode; } static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, @@ -104,13 +83,13 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 2 + b)->as_mode.first); + return ((cur_mb->bmi + 2 + b)->as_mode); } else { return cur_mb->mbmi.mode; } } - return (cur_mb->bmi + b - 2)->as_mode.first; + return (cur_mb->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index dcc7f03..a95560a 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -124,9 +124,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { // 
Rows for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j]; - vp9_idct4_1d(temp_in, outptr); + vp9_idct4_1d(input, outptr); input += 4; outptr += 4; } @@ -158,23 +156,6 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, int pitch, int stride) { - int a1; - int r, c; - int16_t out = dct_const_round_shift(input_dc * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) - dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); - - dst_ptr += stride; - pred_ptr += pitch; - } -} - static void idct8_1d(int16_t *input, int16_t *output) { int16_t step1[8], step2[8]; int temp1, temp2; @@ -428,12 +409,11 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t out[8 * 8]; + int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; int16_t temp_in[8], temp_out[8]; - vpx_memset(out, 0, sizeof(out)); // First transform rows // only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { @@ -535,6 +515,7 @@ static void idct16_1d(int16_t *input, int16_t *output) { step1[14] = -step2[14] + step2[15]; step1[15] = step2[14] + step2[15]; + // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; step2[0] = dct_const_round_shift(temp1); @@ -852,15 +833,13 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t out[16 * 16]; + int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; int i, j; int16_t temp_in[16], temp_out[16]; - /* First transform rows. Since all non-zero dct coefficients are in - * upper-left 4x4 area, we only need to calculate first 4 rows here. - */ - vpx_memset(out, 0, sizeof(out)); + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { idct16_1d(input, outptr); input += 16; @@ -1283,15 +1262,13 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t out[32 * 32]; + int16_t out[32 * 32] = { 0 }; int16_t *outptr = out; int i, j; int16_t temp_in[32], temp_out[32]; - /* First transform rows. Since all non-zero dct coefficients are in - * upper-left 4x4 area, we only need to calculate first 4 rows here. - */ - vpx_memset(out, 0, sizeof(out)); + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { idct32_1d(input, outptr); input += 32; diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 64f14c9..2d959f0 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -22,10 +22,15 @@ #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) +#define WHT_UPSCALE_FACTOR 2 + #define pair_set_epi16(a, b) \ _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) -// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31. 
+// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) static const int cospi_1_64 = 16364; static const int cospi_2_64 = 16305; diff --git a/libvpx/vp9/common/vp9_implicit_segmentation.c b/libvpx/vp9/common/vp9_implicit_segmentation.c deleted file mode 100644 index 2a1d35f..0000000 --- a/libvpx/vp9/common/vp9_implicit_segmentation.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_onyxc_int.h" - -#define MAX_REGIONS 24000 -#ifndef NULL -#define NULL 0 -#endif - -#define min_mbs_in_region 3 - -// this linked list structure holds equivalences for connected -// component labeling -struct list_el { - int label; - int seg_value; - int count; - struct list_el *next; -}; -typedef struct list_el item; - -// connected colorsegments -typedef struct { - int min_x; - int min_y; - int max_x; - int max_y; - int64_t sum_x; - int64_t sum_y; - int pixels; - int seg_value; - int label; -} segment_info; - - -typedef enum { - SEGMENT_MODE, - SEGMENT_MV, - SEGMENT_REFFRAME, - SEGMENT_SKIPPED -} SEGMENT_TYPE; - - -// this merges the two equivalence lists and -// then makes sure that every label points to the same -// equivalence list -void merge(item *labels, int u, int v) { - item *a = labels[u].next; - item *b = labels[v].next; - item c; - item *it = &c; - int count; - - // check if they are already merged - if (u == v || a == b) - return; - - count = a->count + b->count; - - // merge 2 sorted linked lists. - while (a != NULL && b != NULL) { - if (a->label < b->label) { - it->next = a; - a = a->next; - } else { - it->next = b; - b = b->next; - } - - it = it->next; - } - - if (a == NULL) - it->next = b; - else - it->next = a; - - it = c.next; - - // make sure every equivalence in the linked list points to this new ll - while (it != NULL) { - labels[it->label].next = c.next; - it = it->next; - } - c.next->count = count; - -} - -void segment_via_mode_info(VP9_COMMON *oci, int how) { - MODE_INFO *mi = oci->mi; - int i, j; - int mb_index = 0; - - int label = 1; - int pitch = oci->mb_cols; - - // holds linked list equivalences - // the max should probably be allocated at a higher level in oci - item equivalences[MAX_REGIONS]; - int eq_ptr = 0; - item labels[MAX_REGIONS]; - segment_info segments[MAX_REGIONS]; - int label_count = 1; - int labeling[400 * 300]; - int *lp = labeling; - - label_count = 1; - memset(labels, 0, sizeof(labels)); - memset(segments, 0, sizeof(segments)); - - /* Go through each macroblock first pass labelling */ - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - // int above seg_value, left seg_value, this seg_value... 
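For reference, the first-pass labelling in the deleted segment_via_mode_info() (the loop in progress here) is standard two-pass connected-component labelling over the macroblock grid: each block reuses the label of a matching above or left neighbour, merging the two label equivalence classes when both match, and otherwise opens a new label. A compact stand-alone sketch of the same idea (an illustration only; it uses a union-find in place of the linked-list merge() above and a toy value map in place of the per-MB mode/mv/reference value):

#include <stdio.h>

#define W 8
#define H 6
#define MAX_LABELS (W * H + 1)

static int parent[MAX_LABELS];

static int find_root(int x) {
  while (parent[x] != x) x = parent[x];
  return x;
}

static void merge_labels(int a, int b) {
  a = find_root(a);
  b = find_root(b);
  if (a != b) parent[b] = a;  // union the two equivalence classes
}

int main(void) {
  // toy "segment value" map standing in for the per-MB mode/mv/ref value
  static const int val[H][W] = {
    {1, 1, 0, 0, 0, 2, 2, 2}, {1, 1, 0, 0, 0, 2, 2, 2},
    {0, 0, 0, 1, 1, 1, 0, 0}, {0, 0, 0, 1, 1, 1, 0, 0},
    {3, 3, 0, 0, 0, 0, 0, 0}, {3, 3, 0, 0, 0, 2, 2, 2},
  };
  int label[H][W];
  int next = 1, r, c;

  for (r = 0; r < MAX_LABELS; ++r) parent[r] = r;

  // first pass: reuse the above/left label when the value matches, else new label
  for (r = 0; r < H; ++r)
    for (c = 0; c < W; ++c) {
      const int a = r ? (val[r - 1][c] == val[r][c] ? label[r - 1][c] : 0) : 0;
      const int l = c ? (val[r][c - 1] == val[r][c] ? label[r][c - 1] : 0) : 0;
      if (a && l) { label[r][c] = a < l ? a : l; merge_labels(a, l); }
      else if (a || l) label[r][c] = a ? a : l;
      else label[r][c] = next++;
    }

  // second pass: collapse labels to their equivalence-class roots and print
  // (the original additionally renumbers surviving regions and drops tiny ones)
  for (r = 0; r < H; ++r) {
    for (c = 0; c < W; ++c) printf("%3d", find_root(label[r][c]));
    printf("\n");
  }
  return 0;
}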
- int a = -1, l = -1, n = -1; - - // above label, left label - int al = -1, ll = -1; - if (i) { - al = lp[j - pitch]; - a = labels[al].next->seg_value; - } - if (j) { - ll = lp[j - 1]; - l = labels[ll].next->seg_value; - } - - // what setting are we going to do the implicit segmentation on - switch (how) { - case SEGMENT_MODE: - n = mi[mb_index].mbmi.mode; - break; - case SEGMENT_MV: - n = mi[mb_index].mbmi.mv[0].as_int; - if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME) - n = -9999999; - break; - case SEGMENT_REFFRAME: - n = mi[mb_index].mbmi.ref_frame[0]; - break; - case SEGMENT_SKIPPED: - n = mi[mb_index].mbmi.mb_skip_coeff; - break; - } - - // above and left both have the same seg_value - if (n == a && n == l) { - // pick the lowest label - lp[j] = (al < ll ? al : ll); - labels[lp[j]].next->count++; - - // merge the above and left equivalencies - merge(labels, al, ll); - } - // this matches above seg_value - else if (n == a) { - // give it the same label as above - lp[j] = al; - labels[al].next->count++; - } - // this matches left seg_value - else if (n == l) { - // give it the same label as above - lp[j] = ll; - labels[ll].next->count++; - } else { - // new label doesn't match either - item *e = &labels[label]; - item *nl = &equivalences[eq_ptr++]; - lp[j] = label; - nl->label = label; - nl->next = 0; - nl->seg_value = n; - nl->count = 1; - e->next = nl; - label++; - } - mb_index++; - } - mb_index++; - } - lp = labeling; - - // give new labels to regions - for (i = 1; i < label; i++) - if (labels[i].next->count > min_mbs_in_region && - labels[labels[i].next->label].label == 0) { - segment_info *cs = &segments[label_count]; - cs->label = label_count; - labels[labels[i].next->label].label = label_count++; - labels[labels[i].next->label].seg_value = labels[i].next->seg_value; - cs->seg_value = labels[labels[i].next->label].seg_value; - cs->min_x = oci->mb_cols; - cs->min_y = oci->mb_rows; - cs->max_x = 0; - cs->max_y = 0; - cs->sum_x = 0; - cs->sum_y = 0; - cs->pixels = 0; - } - - lp = labeling; - - // this is just to gather stats... 
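For reference, the stats pass that follows accumulates, per region, a bounding box, centroid numerators and a pixel count. A minimal stand-alone sketch of that accumulation (it mirrors the segment_info fields min_x/max_x, sum_x/sum_y and pixels; not libvpx code):

#include <stdio.h>

typedef struct {
  int min_x, min_y, max_x, max_y;  // bounding box of the region
  long long sum_x, sum_y;          // centroid numerators
  int pixels;                      // number of points in the region
} region_stats;

static void add_point(region_stats *rs, int x, int y) {
  if (x < rs->min_x) rs->min_x = x;
  if (x > rs->max_x) rs->max_x = x;
  if (y < rs->min_y) rs->min_y = y;
  if (y > rs->max_y) rs->max_y = y;
  rs->sum_x += x;
  rs->sum_y += y;
  rs->pixels++;
}

int main(void) {
  // min_x/min_y start high, like cs->min_x = oci->mb_cols in the original
  region_stats rs = { 1000, 1000, 0, 0, 0, 0, 0 };
  add_point(&rs, 3, 5);
  add_point(&rs, 4, 5);
  add_point(&rs, 4, 6);
  printf("bbox (%d,%d)-(%d,%d), centroid (%lld/%d, %lld/%d)\n",
         rs.min_x, rs.min_y, rs.max_x, rs.max_y,
         rs.sum_x, rs.pixels, rs.sum_y, rs.pixels);
  return 0;
}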
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - const int old_lab = labels[lp[j]].next->label; - const int lab = labels[old_lab].label; - segment_info *cs = &segments[lab]; - - cs->min_x = MIN(cs->min_x, j); - cs->max_x = MAX(cs->max_x, j); - cs->min_y = MIN(cs->min_y, i); - cs->max_y = MAX(cs->max_y, i); - cs->sum_x += j; - cs->sum_y += i; - cs->pixels++; - - lp[j] = lab; - mb_index++; - } - mb_index++; - } - - { - lp = labeling; - printf("labelling \n"); - mb_index = 0; - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - printf("%4d", lp[j]); - } - printf(" "); - for (j = 0; j < oci->mb_cols; j++, mb_index++) { - // printf("%3d",mi[mb_index].mbmi.mode ); - printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row, - mi[mb_index].mbmi.mv[0].as_mv.col); - } - printf("\n"); - ++mb_index; - } - printf("\n"); - } -} - diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 7b3f0be..5498b17 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -33,18 +33,13 @@ static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[NEWMV] = 1; } -void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, - int sharpness_lvl) { - int i; - - /* For each possible value for the loop filter fill out limits */ - for (i = 0; i <= MAX_LOOP_FILTER; i++) { - int filt_lvl = i; - int block_inside_limit = 0; +static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) { + int lvl; - /* Set loop filter paramaeters that control sharpness. */ - block_inside_limit = filt_lvl >> (sharpness_lvl > 0); - block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter paramaeters that control sharpness. 
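For reference, the combined shift introduced on the next line folds the two sequential shifts of the removed vp9_loop_filter_update_sharpness() into one expression; the two forms are identical because (x >> a) >> b == x >> (a + b) for non-negative x. A quick stand-alone check of the equivalence (a sketch, with bounds taken from MAX_LOOP_FILTER and MAX_SHARPNESS):

#include <assert.h>

int main(void) {
  int lvl, s;
  for (s = 0; s <= 7; ++s)           // sharpness_lvl range (MAX_SHARPNESS == 7)
    for (lvl = 0; lvl <= 63; ++lvl)  // filter level range (MAX_LOOP_FILTER == 63)
      assert(((lvl >> (s > 0)) >> (s > 4)) ==   // old: two separate shifts
             (lvl >> ((s > 0) + (s > 4))));     // new: one combined shift
  return 0;
}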
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); if (sharpness_lvl > 0) { if (block_inside_limit > (9 - sharpness_lvl)) @@ -54,21 +49,19 @@ void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, if (block_inside_limit < 1) block_inside_limit = 1; - vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); - vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), - SIMD_WIDTH); - vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); } } -void vp9_loop_filter_init(VP9_COMMON *cm) { +void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { loop_filter_info_n *lfi = &cm->lf_info; int i; // init limits for given sharpness - vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); - cm->last_sharpness_level = cm->sharpness_level; + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; // init LUT for lvl and hev thr picking lf_init_lut(lfi); @@ -78,98 +71,68 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } -void vp9_loop_filter_frame_init(VP9_COMMON *cm, - MACROBLOCKD *xd, +void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, int default_filt_lvl) { - int seg, // segment number - ref, // index in ref_lf_deltas - mode; // index in mode_lf_deltas + int seg; // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 - int n_shift = default_filt_lvl >> 5; - - loop_filter_info_n *lfi = &cm->lf_info; - - /* update limits if sharpness has changed */ - // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl); - // printf("sharpness level: %d [%d]\n", - // cm->sharpness_level, cm->last_sharpness_level); - if (cm->last_sharpness_level != cm->sharpness_level) { - vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); - cm->last_sharpness_level = cm->sharpness_level; + const int n_shift = default_filt_lvl >> 5; + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *lf = &xd->lf; + + // update limits if sharpness has changed + if (lf->last_sharpness_level != lf->sharpness_level) { + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; } - for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) { - int lvl_seg = default_filt_lvl; - int lvl_ref, lvl_mode; - + for (seg = 0; seg < MAX_SEGMENTS; seg++) { + int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment - if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) { - /* Abs value */ - if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { - lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - } else { /* Delta Value */ - lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - lvl_seg = clamp(lvl_seg, 0, 63); - } + if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) { + const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF); + lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA + ? 
data + : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); } - if (!xd->mode_ref_lf_delta_enabled) { - /* we could get rid of this if we assume that deltas are set to - * zero when not in use; encoder always uses deltas - */ + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); continue; } - lvl_ref = lvl_seg; + intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift); + lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); - /* INTRA_FRAME */ - ref = INTRA_FRAME; - - /* Apply delta for reference frame */ - lvl_ref += xd->ref_lf_deltas[ref] << n_shift; - - mode = 0; /* all the rest of Intra modes */ - lvl_mode = lvl_ref; - lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63); - - /* LAST, GOLDEN, ALT */ - for (ref = 1; ref < MAX_REF_FRAMES; ref++) { - int lvl_ref = lvl_seg; - - /* Apply delta for reference frame */ - lvl_ref += xd->ref_lf_deltas[ref] << n_shift; - - /* Apply delta for Inter modes */ - for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) { - lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift); - lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63); + for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift) + + (lf->mode_deltas[mode] << n_shift); + lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); } - } } } -static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi, - struct loop_filter_info *lfi) { - const loop_filter_info_n *lfi_n = &cm->lf_info; - int mode = mbmi->mode; - int mode_index = lfi_n->mode_lf_lut[mode]; - int seg = mbmi->segment_id; - int ref_frame = mbmi->ref_frame[0]; - int filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; - - if (filter_level) { - const int hev_index = filter_level >> 4; +static int build_lfi(const loop_filter_info_n *const lfi_n, + const MB_MODE_INFO *const mbmi, + struct loop_filter_info *const lfi) { + const int seg = mbmi->segment_id; + const int ref = mbmi->ref_frame[0]; + const int mode = lfi_n->mode_lf_lut[mbmi->mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode]; + + if (filter_level > 0) { lfi->mblim = lfi_n->mblim[filter_level]; - lfi->blim = lfi_n->blim[filter_level]; lfi->lim = lfi_n->lim[filter_level]; - lfi->hev_thr = lfi_n->hev_thr[hev_index]; + lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4]; return 1; + } else { + return 0; } - return 0; } static void filter_selectively_vert(uint8_t *s, int pitch, @@ -180,7 +143,8 @@ static void filter_selectively_vert(uint8_t *s, int pitch, const struct loop_filter_info *lfi) { unsigned int mask; - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) { + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { if (mask & 1) { if (mask_16x16 & 1) { vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, @@ -198,14 +162,11 @@ static void filter_selectively_vert(uint8_t *s, int pitch, lfi->hev_thr, 1); assert(!(mask_16x16 & 1)); assert(!(mask_8x8 & 1)); - } else { - assert(0); } - - if (mask_4x4_int & 1) - vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); } + if (mask_4x4_int & 1) + vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); s += 8; lfi++; mask_16x16 >>= 1; @@ -223,13 +184,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int only_4x4_1, 
const struct loop_filter_info *lfi) { unsigned int mask; + int count; - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) { + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= count) { + count = 1; if (mask & 1) { if (!only_4x4_1) { if (mask_16x16 & 1) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); + if ((mask_16x16 & 3) == 3) { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + count = 2; + } else { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } assert(!(mask_8x8 & 1)); assert(!(mask_4x4 & 1)); assert(!(mask_4x4_int & 1)); @@ -243,8 +213,6 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, lfi->hev_thr, 1); assert(!(mask_16x16 & 1)); assert(!(mask_8x8 & 1)); - } else { - assert(0); } } @@ -252,40 +220,41 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } - s += 8; - lfi++; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; + s += 8 * count; + lfi += count; + mask_16x16 >>= count; + mask_8x8 >>= count; + mask_4x4 >>= count; + mask_4x4_int >>= count; } } -static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, - int plane, int mi_row, int mi_col) { - const int ss_x = xd->plane[plane].subsampling_x; - const int ss_y = xd->plane[plane].subsampling_y; - const int row_step = 1 << xd->plane[plane].subsampling_y; - const int col_step = 1 << xd->plane[plane].subsampling_x; - struct buf_2d * const dst = &xd->plane[plane].dst; +static void filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + const MODE_INFO *mi, + int mi_row, int mi_col) { + const int ss_x = plane->subsampling_x; + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_x; + const int col_step = 1 << ss_y; + const int row_step_stride = cm->mode_info_stride * row_step; + struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; - MODE_INFO* const mi0 = xd->mode_info_context; - unsigned int mask_16x16[64 / MI_SIZE] = {0}; - unsigned int mask_8x8[64 / MI_SIZE] = {0}; - unsigned int mask_4x4[64 / MI_SIZE] = {0}; - unsigned int mask_4x4_int[64 / MI_SIZE] = {0}; - struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE]; + unsigned int mask_16x16[MI_BLOCK_SIZE] = {0}; + unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; + unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; + unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; + struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; - for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; unsigned int mask_8x8_c = 0; unsigned int mask_4x4_c = 0; unsigned int border_mask; // Determine the vertical edges that need filtering - for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) { - const MODE_INFO * const mi = xd->mode_info_context; + for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const int skip_this = mi[c].mbmi.mb_skip_coeff && mi[c].mbmi.ref_frame[0] != INTRA_FRAME; // left edge of current unit is block/partition edge -> no skip @@ -296,14 +265,14 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ? 
!(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; - const TX_SIZE tx_size = plane ? get_uv_tx_size(&mi[c].mbmi) - : mi[c].mbmi.txfm_size; + const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) + ? get_uv_tx_size(&mi[c].mbmi) + : mi[c].mbmi.txfm_size; const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(cm, &mi[c].mbmi, - lfi[r] + (c >> xd->plane[plane].subsampling_x))) + if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x))) continue; // Build masks based on the transform size of each block @@ -362,13 +331,12 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, mask_4x4_c & border_mask, mask_4x4_int[r], lfi[r]); dst->buf += 8 * dst->stride; - xd->mode_info_context += cm->mode_info_stride * row_step; + mi += row_step_stride; } // Now do horizontal pass dst->buf = dst0; - xd->mode_info_context = mi0; - for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; @@ -378,30 +346,33 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, mask_4x4[r], mask_4x4_int_r, mi_row + r == 0, lfi[r]); dst->buf += 8 * dst->stride; - xd->mode_info_context += cm->mode_info_stride * row_step; } } -void vp9_loop_filter_frame(VP9_COMMON *cm, - MACROBLOCKD *xd, - int frame_filter_level, - int y_only) { +void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, + VP9_COMMON *cm, MACROBLOCKD *xd, + int start, int stop, int y_only) { + const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; - // Initialize the loop filter for this frame. - vp9_loop_filter_frame_init(cm, xd, frame_filter_level); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) { + for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; - setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col); - for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) { - xd->mode_info_context = mi + mi_col; - filter_block_plane(cm, xd, plane, mi_row, mi_col); + setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + for (plane = 0; plane < num_planes; ++plane) { + filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col); } } } } + +void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, + int frame_filter_level, int y_only) { + if (!frame_filter_level) return; + vp9_loop_filter_frame_init(cm, xd, frame_filter_level); + vp9_loop_filter_rows(cm->frame_to_show, cm, xd, + 0, cm->mi_rows, y_only); +} diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index ce954c0..e59cc64 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -13,61 +13,46 @@ #include "vpx_ports/mem.h" #include "vpx_config.h" + #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_seg_common.h" #define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + #define SIMD_WIDTH 16 -/* Need to align this structure so when it is declared and - * passed it can be loaded into vector registers. 
- */ +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. typedef struct { - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, - blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[4][SIMD_WIDTH]); - unsigned char lvl[MAX_MB_SEGMENTS][4][4]; - unsigned char mode_lf_lut[MB_MODE_COUNT]; + uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; + uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; struct loop_filter_info { - const unsigned char *mblim; - const unsigned char *blim; - const unsigned char *lim; - const unsigned char *hev_thr; + const uint8_t *mblim; + const uint8_t *lim; + const uint8_t *hev_thr; }; -#define prototype_loopfilter(sym) \ - void sym(uint8_t *src, int pitch, const unsigned char *blimit, \ - const unsigned char *limit, const unsigned char *thresh, int count) - -#define prototype_loopfilter_block(sym) \ - void sym(uint8_t *y, uint8_t *u, uint8_t *v, \ - int ystride, int uv_stride, struct loop_filter_info *lfi) - -#if ARCH_X86 || ARCH_X86_64 -#include "x86/vp9_loopfilter_x86.h" -#endif - -typedef void loop_filter_uvfunction(uint8_t *u, /* source pointer */ - int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - uint8_t *v); /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; -void vp9_loop_filter_init(struct VP9Common *cm); +void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf); -void vp9_loop_filter_frame_init(struct VP9Common *cm, - struct macroblockd *mbd, +// Update the loop filter for the current frame. +// This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame() +// calls this function directly. +void vp9_loop_filter_frame_init(struct VP9Common *const cm, + struct macroblockd *const xd, int default_filt_lvl); void vp9_loop_filter_frame(struct VP9Common *cm, @@ -75,11 +60,8 @@ void vp9_loop_filter_frame(struct VP9Common *cm, int filter_level, int y_only); -void vp9_loop_filter_partial_frame(struct VP9Common *cm, - struct macroblockd *mbd, - int default_filt_lvl); - -void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, - int sharpness_lvl); - +// Apply the loop filter to [start, stop) macro block rows in frame_buffer. 
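For reference, the declaration that follows exposes the filter over an explicit [start, stop) row range, and vp9_loop_filter_frame() (see the .c change above) now reduces to the init call plus one pass over [0, cm->mi_rows). As an illustration of how a caller might use the range form, for example to filter a frame in two bands, a hedged sketch (the helper below and the band split are assumptions for illustration, not code from this change; it assumes the usual VP9_COMMON / MACROBLOCKD definitions are in scope):

// Sketch: split frame filtering into two row bands with the [start, stop) API.
static void loop_filter_in_two_bands(VP9_COMMON *cm, MACROBLOCKD *xd,
                                     int filter_level, int y_only) {
  // align the split point to MI_BLOCK_SIZE, as the row loop steps by that amount
  const int mid = (cm->mi_rows / 2 / MI_BLOCK_SIZE) * MI_BLOCK_SIZE;
  if (!filter_level)
    return;
  vp9_loop_filter_frame_init(cm, xd, filter_level);
  vp9_loop_filter_rows(cm->frame_to_show, cm, xd, 0, mid, y_only);
  vp9_loop_filter_rows(cm->frame_to_show, cm, xd, mid, cm->mi_rows, y_only);
}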
+void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, struct macroblockd *xd, + int start, int stop, int y_only); #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c index 0efbcaf..88130d8 100644 --- a/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -34,17 +34,44 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, return ~mask; } +static INLINE int8_t flat_mask4(uint8_t thresh, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask5(uint8_t thresh, + uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, + uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, + uint8_t q3, uint8_t q4) { + int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} + // is there high edge variance internal edge: 11111111 yes, 00000000 no -static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1) { +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { int8_t hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; hev |= (abs(q1 - q0) > thresh) * -1; return hev; } -static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { +static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; const int8_t ps1 = (int8_t) *op1 ^ 0x80; @@ -68,7 +95,7 @@ static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1, *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; // outer tap adjustments - filter = ((filter1 + 1) >> 1) & ~hev; + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; @@ -88,8 +115,8 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } } @@ -108,57 +135,30 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - filter(mask, hev, s - 2, s - 1, s, s + 1); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + filter4(mask, hev, s - 2, s - 1, s, s + 1); s += pitch; } } -static INLINE int8_t flatmask4(uint8_t thresh, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { - int8_t flat = 0; - flat |= (abs(p1 - p0) > thresh) * -1; - flat |= (abs(q1 - q0) > thresh) * -1; - flat |= (abs(p0 - p2) > thresh) * -1; - flat |= (abs(q0 - q2) > 
thresh) * -1; - flat |= (abs(p3 - p0) > thresh) * -1; - flat |= (abs(q3 - q0) > thresh) * -1; - return ~flat; -} -static INLINE signed char flatmask5(uint8_t thresh, - uint8_t p4, uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, uint8_t q2, - uint8_t q3, uint8_t q4) { - int8_t flat = 0; - flat |= (abs(p4 - p0) > thresh) * -1; - flat |= (abs(q4 - q0) > thresh) * -1; - flat = ~flat; - return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); -} - - -static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3) { - // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line +static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { if (flat && mask) { const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3); - *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3); - *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3); - *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3); - *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3); - *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3); + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter(mask, hev, op1, op0, oq0, oq1); + filter4(mask, hev, op1, op0, oq0, oq1); } } @@ -177,11 +177,10 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - mbfilter(mask, hev, flat, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } @@ -198,23 +197,24 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); s += pitch; } } -static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, - uint8_t flat, uint8_t flat2, - uint8_t *op7, uint8_t *op6, uint8_t *op5, - uint8_t *op4, uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, 
uint8_t *oq0, - uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, - uint8_t *oq7) { - // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line +static INLINE void filter16(int8_t mask, uint8_t hev, + uint8_t flat, uint8_t flat2, + uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, + uint8_t *oq6, uint8_t *oq7) { if (flat2 && flat && mask) { const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; @@ -222,6 +222,7 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + @@ -251,35 +252,35 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { - mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { + const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < 8 * count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flatmask5(1, + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - wide_mbfilter(mask, hev, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); - + filter16(mask, hev, flat, flat2, + s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); ++s; } } @@ -295,14 +296,14 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7]); - - wide_mbfilter(mask, hev, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8], 
s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7]); + + filter16(mask, hev, flat, flat2, + s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); s += p; } } diff --git a/libvpx/vp9/common/vp9_maskingmv.c b/libvpx/vp9/common/vp9_maskingmv.c deleted file mode 100644 index 326201b..0000000 --- a/libvpx/vp9/common/vp9_maskingmv.c +++ /dev/null @@ -1,803 +0,0 @@ -/* - ============================================================================ - Name : vp9_maskingmv.c - Author : jimbankoski - Version : - Copyright : Your copyright notice - Description : Hello World in C, Ansi-style - ============================================================================ - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -unsigned int vp9_sad16x16_sse3( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - int max_err); - -int vp8_growmaskmb_sse3( - unsigned char *om, - unsigned char *nm); - -void vp8_makemask_sse3( - unsigned char *y, - unsigned char *u, - unsigned char *v, - unsigned char *ym, - int yp, - int uvp, - int ys, - int us, - int vs, - int yt, - int ut, - int vt); - -unsigned int vp9_sad16x16_unmasked_wmt( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - unsigned char *mask); - -unsigned int vp9_sad16x16_masked_wmt( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - unsigned char *mask); - -unsigned int vp8_masked_predictor_wmt( - unsigned char *masked, - unsigned char *unmasked, - int src_stride, - unsigned char *dst_ptr, - int dst_stride, - unsigned char *mask); -unsigned int vp8_masked_predictor_uv_wmt( - unsigned char *masked, - unsigned char *unmasked, - int src_stride, - unsigned char *dst_ptr, - int dst_stride, - unsigned char *mask); -unsigned int vp8_uv_from_y_mask( - unsigned char *ymask, - unsigned char *uvmask); -int yp = 16; -unsigned char sxy[] = { - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 
90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90 -}; - -unsigned char sts[] = { - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -}; -unsigned char str[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; - -unsigned char y[] = { - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40 -}; -int uvp = 8; -unsigned char u[] = { - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 84, 70, 70, 90, 90, 90, 17, 17, - 84, 70, 70, 90, 90, 90, 17, 17, - 80, 70, 70, 90, 90, 90, 17, 17, - 90, 80, 70, 70, 90, 90, 
90, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17 -}; - -unsigned char v[] = { - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80 -}; - -unsigned char ym[256]; -unsigned char uvm[64]; -typedef struct { - unsigned char y; - unsigned char yt; - unsigned char u; - unsigned char ut; - unsigned char v; - unsigned char vt; - unsigned char use; -} COLOR_SEG_ELEMENT; - -/* -COLOR_SEG_ELEMENT segmentation[]= -{ - { 60,4,80,17,80,10, 1}, - { 40,4,15,10,80,10, 1}, -}; -*/ - -COLOR_SEG_ELEMENT segmentation[] = { - { 79, 44, 92, 44, 237, 60, 1}, -}; - -unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v, - COLOR_SEG_ELEMENT sgm[], - int c) { - COLOR_SEG_ELEMENT *s = sgm; - unsigned char m = 0; - int i; - for (i = 0; i < c; i++, s++) - m |= (abs(y - s->y) < s->yt && - abs(u - s->u) < s->ut && - abs(v - s->v) < s->vt ? 255 : 0); - - return m; -} -int neighbors[256][8]; -int makeneighbors(void) { - int i, j; - for (i = 0; i < 256; i++) { - int r = (i >> 4), c = (i & 15); - int ni = 0; - for (j = 0; j < 8; j++) - neighbors[i][j] = i; - for (j = 0; j < 256; j++) { - int nr = (j >> 4), nc = (j & 15); - if (abs(nr - r) < 2 && abs(nc - c) < 2) - neighbors[i][ni++] = j; - } - } - return 0; -} -void grow_ymask(unsigned char *ym) { - unsigned char nym[256]; - int i, j; - - for (i = 0; i < 256; i++) { - nym[i] = ym[i]; - for (j = 0; j < 8; j++) { - nym[i] |= ym[neighbors[i][j]]; - } - } - for (i = 0; i < 256; i++) - ym[i] = nym[i]; -} - -void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v, - unsigned char *ym, unsigned char *uvm, - int yp, int uvp, - COLOR_SEG_ELEMENT sgm[], - int count) { - int r, c; - unsigned char *oym = ym; - - memset(ym, 20, 256); - for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32) - for (c = 0; c < 8; c++) { - int y1 = y[c << 1]; - int u1 = u[c]; - int v1 = v[c]; - int m = pixel_mask(y1, u1, v1, sgm, count); - uvm[c] = m; - ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count); - ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count); - ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count); - ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count); - } - grow_ymask(oym); -} - -int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) - for (j = 0; j < 16; j++) - if (ym[j]) - sad += abs(src[j] - dst[j]); - - return sad; -} - -int compare_masks(unsigned char *sym, unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, sym += 16, ym += 16) - for (j = 0; j < 16; j++) - sad += (sym[j] != ym[j] ? 
1 : 0); - - return sad; -} - -int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) - for (j = 0; j < 16; j++) - if (!ym[j]) - sad += abs(src[j] - dst[j]); - - return sad; -} - -int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, - int yp, int uvp, - unsigned char *dy, unsigned char *du, unsigned char *dv, - int dyp, int duvp, - COLOR_SEG_ELEMENT sgm[], - int count, - int *mi, - int *mj, - int *ui, - int *uj, - int *wm) { - int i, j; - - unsigned char ym[256]; - unsigned char uvm[64]; - unsigned char dym[256]; - unsigned char duvm[64]; - unsigned int e = 0; - int beste = 256; - int bmi = -32, bmj = -32; - int bui = -32, buj = -32; - int beste1 = 256; - int bmi1 = -32, bmj1 = -32; - int bui1 = -32, buj1 = -32; - int obeste; - - // first try finding best mask and then unmasked - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); - - e = unmasked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - // bui=0;buj=0; - // best mv masked destination - make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, - dym, duvm, dyp, duvp, sgm, count); - - obeste = beste; - beste = 0xffffffff; - - // find best masked - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = masked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - beste1 = beste + obeste; - bmi1 = bmi; - bmj1 = bmj; - bui1 = bui; - buj1 = buj; - - beste = 0xffffffff; - // source mask - make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count); - - // find best mask - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); - - e = compare_masks(ym, dym); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - - - // best mv masked destination - make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, - dym, duvm, dyp, duvp, sgm, count); - - obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym); - - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = unmasked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - beste += obeste; - - - if (beste < beste1) { - *mi = bmi; - *mj = bmj; - *ui = bui; - *uj = buj; - *wm = 1; - } else { - *mi = bmi1; - *mj = bmj1; - *ui = bui1; - *uj = buj1; - *wm = 0; - - } - return 0; -} - -int predict(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym, unsigned char *prd) { - int i, j; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16) - for (j = 0; j < 16; j++) - prd[j] = (ym[j] ? 
src[j] : dst[j]); - return 0; -} - -int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, - int yp, int uvp, - unsigned char *dy, unsigned char *du, unsigned char *dv, - int dyp, int duvp, - COLOR_SEG_ELEMENT sgm[], - int count, - int *mi, - int *mj, - int *ui, - int *uj, - int *wm) { - int i, j; - - unsigned char ym[256]; - unsigned char ym2[256]; - unsigned char uvm[64]; - unsigned char dym2[256]; - unsigned char dym[256]; - unsigned char duvm[64]; - unsigned int e = 0; - int beste = 256; - int bmi = -32, bmj = -32; - int bui = -32, buj = -32; - int beste1 = 256; - int bmi1 = -32, bmj1 = -32; - int bui1 = -32, buj1 = -32; - int obeste; - - // first try finding best mask and then unmasked - beste = 0xffffffff; - -#if 0 - for (i = 0; i < 16; i++) { - unsigned char *dy = i * yp + y; - for (j = 0; j < 16; j++) - printf("%2x", dy[j]); - printf("\n"); - } - printf("\n"); - - for (i = -32; i < 48; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 48; j++) - printf("%2x", dyz[j]); - printf("\n"); - } -#endif - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - // bui=0;buj=0; - // best mv masked destination - - vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, - dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - obeste = beste; - beste = 0xffffffff; - - // find best masked - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2); - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - beste1 = beste + obeste; - bmi1 = bmi; - bmj1 = bmj; - bui1 = bui; - buj1 = buj; - - // source mask - vp8_makemask_sse3(y, u, v, - ym, yp, uvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(ym, ym2); - - // find best mask - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - e = compare_masks(ym2, dym2); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - - vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, - dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2); - - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - beste += obeste; - - if (beste < 
beste1) { - *mi = bmi; - *mj = bmj; - *ui = bui; - *uj = buj; - *wm = 1; - } else { - *mi = bmi1; - *mj = bmj1; - *ui = bui1; - *uj = buj1; - *wm = 0; - beste = beste1; - - } - return beste; -} - -int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, - int ymp, int uvmp, - unsigned char *yp, unsigned char *up, unsigned char *vp, - int ypp, int uvpp, - COLOR_SEG_ELEMENT sgm[], - int count, - int mi, - int mj, - int ui, - int uj, - int wm) { - int i, j; - unsigned char dym[256]; - unsigned char dym2[256]; - unsigned char duvm[64]; - unsigned char *yu = ym, *uu = um, *vu = vm; - - unsigned char *dym3 = dym2; - - ym += mi * ymp + mj; - um += mi / 2 * uvmp + mj / 2; - vm += mi / 2 * uvmp + mj / 2; - - yu += ui * ymp + uj; - uu += ui / 2 * uvmp + uj / 2; - vu += ui / 2 * uvmp + uj / 2; - - // best mv masked destination - if (wm) - vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - else - vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3); - vp8_uv_from_y_mask(dym3, duvm); - vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm); - vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm); - - return 0; -} - -unsigned char f0p[1280 * 720 * 3 / 2]; -unsigned char f1p[1280 * 720 * 3 / 2]; -unsigned char prd[1280 * 720 * 3 / 2]; -unsigned char msk[1280 * 720 * 3 / 2]; - - -int mainz(int argc, char *argv[]) { - - FILE *f = fopen(argv[1], "rb"); - FILE *g = fopen(argv[2], "wb"); - int w = atoi(argv[3]), h = atoi(argv[4]); - int y_stride = w, uv_stride = w / 2; - int r, c; - unsigned char *f0 = f0p, *f1 = f1p, *t; - unsigned char ym[256], uvm[64]; - unsigned char ym2[256], uvm2[64]; - unsigned char ym3[256], uvm3[64]; - int a, b; - - COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best; -#if 0 - makeneighbors(); - COLOR_SEG_ELEMENT segmentation[] = { - { 60, 4, 80, 17, 80, 10, 1}, - { 40, 4, 15, 10, 80, 10, 1}, - }; - make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1); - - vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8, - (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v, - segmentation[0].yt, segmentation[0].ut, segmentation[0].vt); - - vp8_growmaskmb_sse3(ym, ym3); - - a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3); - b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3); - - vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3); - - vp8_uv_from_y_mask(ym3, uvm3); - - return 4; -#endif - makeneighbors(); - - - memset(prd, 128, w * h * 3 / 2); - - fread(f0, w * h * 3 / 2, 1, f); - - while (!feof(f)) { - unsigned char *ys = f1, *yd = f0, *yp = prd; - unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h; - unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4; - fread(f1, w * h * 3 / 2, 1, f); - - ys += 32 * y_stride; - yd += 32 * y_stride; - yp += 32 * y_stride; - us += 16 * uv_stride; - ud += 16 * uv_stride; - up += 16 * uv_stride; - vs += 16 * uv_stride; - vd += 16 * uv_stride; - vp += 16 * uv_stride; - for (r = 32; r < h - 32; r += 16, - ys += 16 * w, yd += 16 * w, yp += 16 * w, - us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride, - vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) { - for (c = 32; c < w - 32; c += 16) { - int mi, mj, ui, uj, wm; - int bmi, bmj, bui, buj, bwm; - unsigned char ym[256]; - - if (vp9_sad16x16_sse3(ys + c, y_stride, 
yd + c, y_stride, 0xffff) == 0) - bmi = bmj = bui = buj = bwm = 0; - else { - COLOR_SEG_ELEMENT cs[5]; - int j; - unsigned int beste = 0xfffffff; - unsigned int bestj = 0; - - // try color from last mb segmentation - cs[0] = last; - - // try color segs from 4 pixels in mb recon as segmentation - cs[1].y = yd[c + y_stride + 1]; - cs[1].u = ud[c / 2 + uv_stride]; - cs[1].v = vd[c / 2 + uv_stride]; - cs[1].yt = cs[1].ut = cs[1].vt = 20; - cs[2].y = yd[c + w + 14]; - cs[2].u = ud[c / 2 + uv_stride + 7]; - cs[2].v = vd[c / 2 + uv_stride + 7]; - cs[2].yt = cs[2].ut = cs[2].vt = 20; - cs[3].y = yd[c + w * 14 + 1]; - cs[3].u = ud[c / 2 + uv_stride * 7]; - cs[3].v = vd[c / 2 + uv_stride * 7]; - cs[3].yt = cs[3].ut = cs[3].vt = 20; - cs[4].y = yd[c + w * 14 + 14]; - cs[4].u = ud[c / 2 + uv_stride * 7 + 7]; - cs[4].v = vd[c / 2 + uv_stride * 7 + 7]; - cs[4].yt = cs[4].ut = cs[4].vt = 20; - - for (j = 0; j < 5; j++) { - int e; - - e = fast_masked_motion_search( - ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride, - yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride, - &cs[j], 1, &mi, &mj, &ui, &uj, &wm); - - if (e < beste) { - bmi = mi; - bmj = mj; - bui = ui; - buj = uj, bwm = wm; - bestj = j; - beste = e; - } - } - best = cs[bestj]; - // best = segmentation[0]; - last = best; - } - predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride, - yp + c, up + c / 2, vp + c / 2, w, uv_stride, - &best, 1, bmi, bmj, bui, buj, bwm); - - } - } - fwrite(prd, w * h * 3 / 2, 1, g); - t = f0; - f0 = f1; - f1 = t; - - } - fclose(f); - fclose(g); - return 0; -} diff --git a/libvpx/vp9/common/vp9_mbpitch.c b/libvpx/vp9/common/vp9_mbpitch.c deleted file mode 100644 index 3cf37ff..0000000 --- a/libvpx/vp9/common/vp9_mbpitch.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/common/vp9_blockd.h" - -void vp9_setup_block_dptrs(MACROBLOCKD *mb, - int subsampling_x, int subsampling_y) { - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) { - mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; - mb->plane[i].subsampling_x = i ? subsampling_x : 0; - mb->plane[i].subsampling_y = i ? subsampling_y : 0; - } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - mb->plane[3].subsampling_x = 0; - mb->plane[3].subsampling_y = 0; -#endif -} diff --git a/libvpx/vp9/common/vp9_modecont.c b/libvpx/vp9/common/vp9_modecont.c deleted file mode 100644 index 5d92cfa..0000000 --- a/libvpx/vp9/common/vp9_modecont.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vp9/common/vp9_modecont.h" - -const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS] - [VP9_INTER_MODES - 1] = { - {2, 173, 34}, // 0 = both zero mv - {7, 145, 85}, // 1 = one zero mv + one a predicted mv - {7, 166, 63}, // 2 = two predicted mvs - {7, 94, 66}, // 3 = one predicted/zero and one new mv - {8, 64, 46}, // 4 = two new mvs - {17, 81, 31}, // 5 = one intra neighbour + x - {25, 29, 30}, // 6 = two intra neighbours -}; diff --git a/libvpx/vp9/common/vp9_modecont.h b/libvpx/vp9/common/vp9_modecont.h deleted file mode 100644 index 3ec6079..0000000 --- a/libvpx/vp9/common/vp9_modecont.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_MODECONT_H_ -#define VP9_COMMON_VP9_MODECONT_H_ - -#include "vp9/common/vp9_entropy.h" - -extern const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS] - [VP9_INTER_MODES - 1]; - -#endif // VP9_COMMON_VP9_MODECONT_H_ diff --git a/libvpx/vp9/common/vp9_modecontext.c b/libvpx/vp9/common/vp9_modecontext.c deleted file mode 100644 index a79ab2a..0000000 --- a/libvpx/vp9/common/vp9_modecontext.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vp9/common/vp9_entropymode.h" - -const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { - { /* above = dc */ - { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, - { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, - { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, - { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, - { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, - { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, - { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, - { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, - { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, - { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ - }, { /* above = v */ - { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, - { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, - { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, - { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, - { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, - { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, - { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, - { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, - { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, - { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ - }, { /* above = h */ - { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, - { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, - { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, - { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, - { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, - { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, - { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, - { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, - { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, - { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ - }, { /* above = d45 */ - { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, - { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, - { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, - { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, - { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, - { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, - { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, - { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, - { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, - { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ - }, { /* above = d135 */ - { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, - { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, - { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, - { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, - { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, - { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, - { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, - { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, - { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, - { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ - }, { /* above = d117 */ - { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, - { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, - { 36, 23, 27, 165, 
149, 166, 54, 64, 118 } /* left = h */, - { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, - { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, - { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, - { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, - { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, - { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, - { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ - }, { /* above = d153 */ - { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, - { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, - { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, - { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, - { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, - { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, - { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, - { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, - { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, - { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ - }, { /* above = d27 */ - { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, - { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, - { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, - { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, - { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, - { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, - { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, - { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, - { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, - { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ - }, { /* above = d63 */ - { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, - { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, - { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, - { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, - { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, - { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, - { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, - { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, - { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, - { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ - }, { /* above = tm */ - { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */, - { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, - { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, - { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, - { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, - { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, - { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, - { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, - { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, - { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ - } -}; diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h index a1eef46..a095258 100644 --- a/libvpx/vp9/common/vp9_mv.h +++ b/libvpx/vp9/common/vp9_mv.h @@ -23,14 +23,9 @@ typedef union int_mv { MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ -struct mv32 { +typedef struct { int32_t row; int32_t col; -}; - -typedef union int_mv32 { - uint64_t as_int; - struct mv32 as_mv; -} int_mv32; /* facilitates faster equality tests and copies */ +} MV32; #endif // 
VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index 78fb2f0..ae009b0 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -11,7 +11,7 @@ #include "vp9/common/vp9_mvref_common.h" #define MVREF_NEIGHBOURS 8 -static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { +static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { // SB4X4 {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, // SB4X8 @@ -147,10 +147,9 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, int_mv c2_refmv; MV_REFERENCE_FRAME c_ref_frame; MV_REFERENCE_FRAME c2_ref_frame; - int candidate_scores[MAX_MV_REF_CANDIDATES]; + int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 }; int refmv_count = 0; - int split_count = 0; - int (*mv_ref_search)[2]; + const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type]; const int mi_col = get_mi_col(xd); const int mi_row = get_mi_row(xd); int intra_count = 0; @@ -160,9 +159,7 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, // Blank the reference vector lists and other local structures. vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); - vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); - mv_ref_search = mv_ref_blocks[mbmi->sb_type]; if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { x_idx = block_idx & 1; y_idx = block_idx >> 1; @@ -193,8 +190,6 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, add_candidate_mv(mv_ref_list, candidate_scores, &refmv_count, c_refmv, 16); } - split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 && - candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME); // Count number of neihgbours coded intra and zeromv intra_count += (candidate_mi->mbmi.mode < NEARESTMV); diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h index b85b889..152046f 100644 --- a/libvpx/vp9/common/vp9_onyx.h +++ b/libvpx/vp9/common/vp9_onyx.h @@ -22,7 +22,7 @@ extern "C" #include "vpx_scale/yv12config.h" #include "vp9/common/vp9_ppflags.h" -#define MAX_MB_SEGMENTS 8 +#define MAX_SEGMENTS 8 typedef int *VP9_PTR; @@ -64,41 +64,13 @@ extern "C" FRAMEFLAGS_ALTREF = 4, } FRAMETYPE_FLAGS; - -#include <assert.h> - static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { - switch (mode) { - case NORMAL: - *hr = 1; - *hs = 1; - break; - case FOURFIVE: - *hr = 4; - *hs = 5; - break; - case THREEFIVE: - *hr = 3; - *hs = 5; - break; - case ONETWO: - *hr = 1; - *hs = 2; - break; - default: - *hr = 1; - *hs = 1; - assert(0); - break; - } - } - typedef struct { int version; // 4 versions of bitstream defined: // 0 - best quality/slowest decode, // 3 - lowest quality/fastest decode int width; // width of data passed to the compressor int height; // height of data passed to the compressor - double frame_rate; // set to passed in framerate + double framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 @@ -228,9 +200,9 @@ extern "C" int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, - int delta_q[MAX_MB_SEGMENTS], - int delta_lf[MAX_MB_SEGMENTS], - unsigned int threshold[MAX_MB_SEGMENTS]); + int delta_q[MAX_SEGMENTS], + int delta_lf[MAX_SEGMENTS], + unsigned int threshold[MAX_SEGMENTS]); int vp9_set_active_map(VP9_PTR comp, unsigned char 
*map, unsigned int rows, unsigned int cols); diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 0d8b0f4..f31f24b 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -24,87 +24,57 @@ #include "vp9/common/vp9_postproc.h" #endif -/* Create/destroy static data structures. */ - -// Define the number of candidate reference buffers. -#define NUM_REF_FRAMES 8 -#define NUM_REF_FRAMES_LG2 3 - #define ALLOWED_REFS_PER_FRAME 3 +#define NUM_REF_FRAMES_LOG2 3 +#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2) + // 1 scratch frame for the new frame, 3 for scaled references on the encoder // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. #define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) -#define NUM_FRAME_CONTEXTS_LG2 2 -#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2) - -#define MAX_LAG_BUFFERS 25 +#define NUM_FRAME_CONTEXTS_LOG2 2 +#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) typedef struct frame_contexts { vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - - nmv_context nmvc; - nmv_context pre_nmvc; - /* interframe intra mode probs */ - vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; - vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; - vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - /* interframe intra mode probs */ - unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; - unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES]; - unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES]; - unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] - [COEF_BANDS][PREV_COEF_CONTEXTS]; - - nmv_context_counts NMVcount; vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS - 1]; - vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; - vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; - unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; vp9_prob single_ref_prob[REF_CONTEXTS][2]; vp9_prob comp_ref_prob[REF_CONTEXTS]; - vp9_prob pre_intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob pre_comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob pre_single_ref_prob[REF_CONTEXTS][2]; - vp9_prob pre_comp_ref_prob[REF_CONTEXTS]; - unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; - unsigned int single_ref_count[REF_CONTEXTS][2][2]; - unsigned int comp_ref_count[REF_CONTEXTS][2]; - - vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; - vp9_prob pre_tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - vp9_prob 
pre_tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob pre_tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; - unsigned int tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; - unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - + struct tx_probs tx_probs; vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; - vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS]; - unsigned int mbskip_count[MBSKIP_CONTEXTS][2]; + nmv_context nmvc; } FRAME_CONTEXT; +typedef struct { + unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; + unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES]; + unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES]; + unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] + [COEF_BANDS][PREV_COEF_CONTEXTS]; + unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int mbskip[MBSKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + + typedef enum { SINGLE_PREDICTION_ONLY = 0, COMP_PREDICTION_ONLY = 1, @@ -112,22 +82,13 @@ typedef enum { NB_PREDICTION_TYPES = 3, } COMPPREDMODE_TYPE; -typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - ALLOW_32X32 = 3, - TX_MODE_SELECT = 4, - NB_TXFM_MODES = 5, -} TXFM_MODE; - typedef struct VP9Common { struct vpx_internal_error_info error; - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]); - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); #if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); #endif int width; @@ -143,8 +104,6 @@ typedef struct VP9Common { int subsampling_x; int subsampling_y; - YUV_TYPE clr_type; - YV12_BUFFER_CONFIG *frame_to_show; YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; @@ -159,10 +118,7 @@ typedef struct VP9Common { struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; int new_fb_idx; - YV12_BUFFER_CONFIG post_proc_buffer; - YV12_BUFFER_CONFIG temp_scale_frame; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ FRAME_TYPE frame_type; @@ -187,7 +143,7 @@ typedef struct VP9Common { int mode_info_stride; /* profile settings */ - TXFM_MODE txfm_mode; + TX_MODE tx_mode; int base_qindex; int last_kf_gf_q; /* Q used on the last GF or KF */ @@ -200,9 +156,6 @@ typedef struct VP9Common { int a_ac_delta_q; #endif - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. 
*/ @@ -219,10 +172,6 @@ typedef struct VP9Common { loop_filter_info_n lf_info; - int filter_level; - int last_sharpness_level; - int sharpness_level; - int refresh_frame_context; /* Two state 0 = NO, 1 = YES */ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ @@ -235,17 +184,6 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT left_seg_context[8]; - /* keyframe block modes are predicted by their above, left neighbors */ - - vp9_prob kf_y_mode_prob[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1]; - vp9_prob kf_uv_mode_prob[VP9_INTRA_MODES] [VP9_INTRA_MODES - 1]; - - // Context probabilities when using predictive coding of segment id - vp9_prob segment_pred_probs[PREDICTION_PROBS]; - unsigned char temporal_update; - // Context probabilities for reference frame prediction int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; @@ -255,14 +193,11 @@ typedef struct VP9Common { FRAME_CONTEXT fc; /* this frame entropy */ FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; unsigned int frame_context_idx; /* Context to use/update */ + FRAME_COUNTS counts; unsigned int current_video_frame; - int near_boffset[3]; int version; - double bitrate; - double framerate; - #if CONFIG_POSTPROC struct postproc_state postproc_state; #endif @@ -270,10 +205,9 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; - int tile_columns, log2_tile_columns; - int cur_tile_mi_col_start, cur_tile_mi_col_end, cur_tile_col_idx; - int tile_rows, log2_tile_rows; - int cur_tile_mi_row_start, cur_tile_mi_row_end, cur_tile_row_idx; + int log2_tile_cols, log2_tile_rows; + int cur_tile_mi_col_start, cur_tile_mi_col_end; + int cur_tile_mi_row_start, cur_tile_mi_row_end; } VP9_COMMON; static int get_free_fb(VP9_COMMON *cm) { @@ -296,15 +230,14 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) { buf[new_idx]++; } -static int mi_cols_aligned_to_sb(VP9_COMMON *cm) { - return 2 * ((cm->mb_cols + 3) & ~3); +static int mi_cols_aligned_to_sb(int n_mis) { + return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE); } -static INLINE void set_partition_seg_context(VP9_COMMON *cm, - MACROBLOCKD *xd, +static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { xd->above_seg_context = cm->above_seg_context + mi_col; - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); + xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); } static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 4282ddd..1157fbb 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -411,7 +411,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { } - for (next = next; next < 256; next++) + for (; next < 256; next++) char_dist[next] = 0; } @@ -630,9 +630,11 @@ static void constrain_line(int x0, int *x1, int y0, int *y1, } } -int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, +int vp9_post_proc_frame(struct VP9Common *oci, + struct loopfilter *lf, + YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { - int q = oci->filter_level * 10 / 6; + int q = lf->filter_level * 10 / 6; int flags = ppflags->post_proc_flag; int deblock_level = ppflags->deblocking_level; int noise_level = ppflags->noise_level; @@ -758,7 +760,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, if (flags & VP9D_DEBUG_TXT_RATE_INFO) { char message[512]; 
snprintf(message, sizeof(message), - "Bitrate: %10.2f frame_rate: %10.2f ", + "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate); vp9_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); @@ -936,9 +938,9 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, for (bx = 0; bx < 16; bx += 4) { if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) || (ppflags->display_mb_modes_flag & I4X4_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2]; + Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; + U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; + V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, 0xc000, y_stride); diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h index 2c0d333..a814e39 100644 --- a/libvpx/vp9/common/vp9_postproc.h +++ b/libvpx/vp9/common/vp9_postproc.h @@ -26,8 +26,8 @@ struct postproc_state { #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); +int vp9_post_proc_frame(struct VP9Common *oci, struct loopfilter *lf, + YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index 17da4f2..e8bcdea 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -16,505 +16,425 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" -// TBD prediction functions for various bitstream signals - // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id) { +unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + // left + const int left_mv_pred = is_inter_mode(left_mbmi->mode); + const int left_interp = left_in_image && left_mv_pred ? + vp9_switchable_interp_map[left_mbmi->interp_filter] : + VP9_SWITCHABLE_FILTERS; + + // above + const int above_mv_pred = is_inter_mode(above_mbmi->mode); + const int above_interp = above_in_image && above_mv_pred ? 
+ vp9_switchable_interp_map[above_mbmi->interp_filter] : + VP9_SWITCHABLE_FILTERS; + + assert(left_interp != -1); + assert(above_interp != -1); + + if (left_interp == above_interp) + return left_interp; + else if (left_interp == VP9_SWITCHABLE_FILTERS && + above_interp != VP9_SWITCHABLE_FILTERS) + return above_interp; + else if (left_interp != VP9_SWITCHABLE_FILTERS && + above_interp == VP9_SWITCHABLE_FILTERS) + return left_interp; + else + return VP9_SWITCHABLE_FILTERS; +} +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { int pred_context; const MODE_INFO *const mi = xd->mode_info_context; - const MODE_INFO *const above_mi = mi - cm->mode_info_stride; - const MODE_INFO *const left_mi = mi - 1; - const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image; - const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - switch (pred_id) { - case PRED_SEG_ID: - pred_context = above_mi->mbmi.seg_id_predicted; - if (xd->left_available) - pred_context += left_mi->mbmi.seg_id_predicted; - break; - - case PRED_MBSKIP: - pred_context = above_mi->mbmi.mb_skip_coeff; - if (xd->left_available) - pred_context += left_mi->mbmi.mb_skip_coeff; - break; - - case PRED_SWITCHABLE_INTERP: { - // left - const int left_mv_pred = is_inter_mode(left_mi->mbmi.mode); - const int left_interp = left_in_image && left_mv_pred ? - vp9_switchable_interp_map[left_mi->mbmi.interp_filter] : - VP9_SWITCHABLE_FILTERS; - - // above - const int above_mv_pred = is_inter_mode(above_mi->mbmi.mode); - const int above_interp = above_in_image && above_mv_pred ? - vp9_switchable_interp_map[above_mi->mbmi.interp_filter] : - VP9_SWITCHABLE_FILTERS; - - assert(left_interp != -1); - assert(above_interp != -1); - - if (left_interp == above_interp) - pred_context = left_interp; - else if (left_interp == VP9_SWITCHABLE_FILTERS && - above_interp != VP9_SWITCHABLE_FILTERS) - pred_context = above_interp; - else if (left_interp != VP9_SWITCHABLE_FILTERS && - above_interp == VP9_SWITCHABLE_FILTERS) - pred_context = left_interp; - else - pred_context = VP9_SWITCHABLE_FILTERS; - - break; + if (above_in_image && left_in_image) { // both edges available + if (left_mbmi->ref_frame[0] == INTRA_FRAME && + above_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (3) + pred_context = 3; + } else { // intra/inter (1) or inter/inter (0) + pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME || + above_mbmi->ref_frame[0] == INTRA_FRAME; } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; - case PRED_INTRA_INTER: { - if (above_in_image && left_in_image) { // both edges available - if (left_mi->mbmi.ref_frame[0] == INTRA_FRAME && - above_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (3) - pred_context = 3; - } else { // intra/inter (1) or inter/inter (0) - pred_context = left_mi->mbmi.ref_frame[0] == INTRA_FRAME || - above_mi->mbmi.ref_frame[0] == INTRA_FRAME; - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - // inter: 0, intra: 2 - pred_context = 2 * (edge->mbmi.ref_frame[0] == INTRA_FRAME); - } else { - pred_context = 0; - } - assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); - break; - } + // inter: 0, intra: 2 + pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME); + } else { + pred_context = 0; + } + assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); + return pred_context; +} +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[1] <= INTRA_FRAME && + left_mbmi->ref_frame[1] <= INTRA_FRAME) + // neither edge uses comp pred (0/1) + pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); + else if (above_mbmi->ref_frame[1] <= INTRA_FRAME) + // one of two edges uses comp pred (2/3) + pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || + above_mbmi->ref_frame[0] == INTRA_FRAME); + else if (left_mbmi->ref_frame[1] <= INTRA_FRAME) + // one of two edges uses comp pred (2/3) + pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || + left_mbmi->ref_frame[0] == INTRA_FRAME); + else // both edges use comp pred (4) + pred_context = 4; + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + // edge does not use comp pred (0/1) + pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; + else + // edge uses comp pred (3) + pred_context = 3; + } else { // no edges available (1) + pred_context = 1; + } + assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); + return pred_context; +} - case PRED_COMP_INTER_INTER: { - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && - left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - // neither edge uses comp pred (0/1) - pred_context = ((above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref) ^ - (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref)); - } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - // one of two edges uses comp pred (2/3) - pred_context = 2 + - (above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref || - above_mi->mbmi.ref_frame[0] == INTRA_FRAME); - } else if (left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - // one of two edges uses comp pred (2/3) - pred_context = 2 + - (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME); - } else { // both edges use comp pred (4) +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int var_ref_idx = !fix_ref_idx; + + if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[0] == INTRA_FRAME && + left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (2) + pred_context = 2; + } else if (above_mbmi->ref_frame[0] == INTRA_FRAME || + left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/inter + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ? + left_mbmi : above_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) // single pred (1/3) + pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]); + else // comp pred (1/3) + pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx] + != cm->comp_var_ref[1]); + } else { // inter/inter + int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME; + int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME; + MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] + : above_mbmi->ref_frame[var_ref_idx]; + MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] + : left_mbmi->ref_frame[var_ref_idx]; + + if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { + pred_context = 0; + } else if (l_sg && a_sg) { // single/single + if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || + (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) pred_context = 4; - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? 
above_mi : left_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - // edge does not use comp pred (0/1) - pred_context = edge->mbmi.ref_frame[0] == cm->comp_fixed_ref; - } else { // edge uses comp pred (3) + else if (vrfa == vrfl) pred_context = 3; - } - } else { // no edges available (1) - pred_context = 1; - } - assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); - break; - } - - case PRED_COMP_REF_P: { - const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; - const int var_ref_idx = !fix_ref_idx; - - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (2) - pred_context = 2; - } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/inter - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? - left_mi : above_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { // single pred (1/3) - pred_context = 1 + - 2 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1]; - } else { // comp pred (1/3) - pred_context = 1 + - 2 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]; - } - } else { // inter/inter - int l_sg = left_mi->mbmi.ref_frame[1] <= INTRA_FRAME; - int a_sg = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME; - MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->mbmi.ref_frame[0] : - above_mi->mbmi.ref_frame[var_ref_idx]; - MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->mbmi.ref_frame[0] : - left_mi->mbmi.ref_frame[var_ref_idx]; - - if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { - pred_context = 0; - } else if (l_sg && a_sg) { // single/single - if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || - (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) { - pred_context = 4; - } else if (vrfa == vrfl) { - pred_context = 3; - } else { - pred_context = 1; - } - } else if (l_sg || a_sg) { // single/comp - MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; - MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; - - if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) { - pred_context = 1; - } else if (rfs == cm->comp_var_ref[1] && - vrfc != cm->comp_var_ref[1]) { - pred_context = 2; - } else { - pred_context = 4; - } - } else if (vrfa == vrfl) { // comp/comp - pred_context = 4; - } else { - pred_context = 2; - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - if (edge->mbmi.ref_frame[0] == INTRA_FRAME) { + else + pred_context = 1; + } else if (l_sg || a_sg) { // single/comp + MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) + pred_context = 1; + else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) pred_context = 2; - } else if (edge->mbmi.ref_frame[1] > INTRA_FRAME) { - pred_context = - 4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]; - } else { - pred_context = 3 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1]; - } - } else { // no edges available (2) + else + pred_context = 4; + } else if (vrfa == vrfl) { // comp/comp + pred_context = 4; + } else { pred_context = 2; } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - break; } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[0] == INTRA_FRAME) + pred_context = 2; + else if (edge_mbmi->ref_frame[1] > INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] + != cm->comp_var_ref[1]); + else + pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]); + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - case PRED_SINGLE_REF_P1: { - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - pred_context = 2; - } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? - left_mi : above_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME); - } else { - pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME || - edge->mbmi.ref_frame[1] == LAST_FRAME); - } - } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && - left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 2 * (above_mi->mbmi.ref_frame[0] == LAST_FRAME) + - 2 * (left_mi->mbmi.ref_frame[0] == LAST_FRAME); - } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME && - left_mi->mbmi.ref_frame[1] > INTRA_FRAME) { - pred_context = 1 + (above_mi->mbmi.ref_frame[0] == LAST_FRAME || - above_mi->mbmi.ref_frame[1] == LAST_FRAME || - left_mi->mbmi.ref_frame[0] == LAST_FRAME || - left_mi->mbmi.ref_frame[1] == LAST_FRAME); - } else { - MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1]; - - if (rfs == LAST_FRAME) { - pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - } else { - pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - if (edge->mbmi.ref_frame[0] == INTRA_FRAME) { - pred_context = 2; - } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME); - } else { - pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME || - edge->mbmi.ref_frame[1] == LAST_FRAME); - } - } else { // no edges available (2) - pred_context = 2; - } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - break; + return pred_context; +} +unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. 
+ if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[0] == INTRA_FRAME && + left_mbmi->ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (above_mbmi->ref_frame[0] == INTRA_FRAME || + left_mbmi->ref_frame[0] == INTRA_FRAME) { + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ? + left_mbmi : above_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || + edge_mbmi->ref_frame[1] == LAST_FRAME); + } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME && + left_mbmi->ref_frame[1] <= INTRA_FRAME) { + pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + + 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); + } else if (above_mbmi->ref_frame[1] > INTRA_FRAME && + left_mbmi->ref_frame[1] > INTRA_FRAME) { + pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || + above_mbmi->ref_frame[1] == LAST_FRAME || + left_mbmi->ref_frame[0] == LAST_FRAME || + left_mbmi->ref_frame[1] == LAST_FRAME); + } else { + MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + + if (rfs == LAST_FRAME) + pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + else + pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[0] == INTRA_FRAME) + pred_context = 2; + else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || + edge_mbmi->ref_frame[1] == LAST_FRAME); + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} - case PRED_SINGLE_REF_P2: { - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - pred_context = 2; - } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? - left_mi : above_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - if (edge->mbmi.ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else { - pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); - } - } else { - pred_context = 1 + 2 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME || - edge->mbmi.ref_frame[1] == GOLDEN_FRAME); - } - } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && - left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - if (above_mi->mbmi.ref_frame[0] == LAST_FRAME && - left_mi->mbmi.ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else if (above_mi->mbmi.ref_frame[0] == LAST_FRAME || - left_mi->mbmi.ref_frame[0] == LAST_FRAME) { - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == LAST_FRAME ? 
- left_mi : above_mi; - - pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 2 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME); - } - } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME && - left_mi->mbmi.ref_frame[1] > INTRA_FRAME) { - if (above_mi->mbmi.ref_frame[0] == left_mi->mbmi.ref_frame[0] && - above_mi->mbmi.ref_frame[1] == left_mi->mbmi.ref_frame[1]) { - pred_context = 3 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME || - above_mi->mbmi.ref_frame[1] == GOLDEN_FRAME || - left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME || - left_mi->mbmi.ref_frame[1] == GOLDEN_FRAME); - } else { - pred_context = 2; - } - } else { - MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1]; - - if (rfs == GOLDEN_FRAME) { - pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); - } else if (rfs == ALTREF_FRAME) { - pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; - } else { - pred_context = - 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - if (edge->mbmi.ref_frame[0] == INTRA_FRAME || - (edge->mbmi.ref_frame[0] == LAST_FRAME && - edge->mbmi.ref_frame[1] <= INTRA_FRAME)) { - pred_context = 2; - } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 3 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME || - edge->mbmi.ref_frame[1] == GOLDEN_FRAME); - } - } else { // no edges available (2) - pred_context = 2; - } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - break; - } +unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; - case PRED_TX_SIZE: { - int above_context, left_context; - int max_tx_size; - if (mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) - max_tx_size = TX_4X4; - else if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16) - max_tx_size = TX_8X8; - else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32) - max_tx_size = TX_16X16; - else - max_tx_size = TX_32X32; - above_context = left_context = max_tx_size; - if (above_in_image) { - above_context = (above_mi->mbmi.mb_skip_coeff ? - max_tx_size : above_mi->mbmi.txfm_size); - } - if (left_in_image) { - left_context = (left_mi->mbmi.mb_skip_coeff ? - max_tx_size : left_mi->mbmi.txfm_size); - } - if (!left_in_image) { - left_context = above_context; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. 
+ if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[0] == INTRA_FRAME && + left_mbmi->ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (above_mbmi->ref_frame[0] == INTRA_FRAME || + left_mbmi->ref_frame[0] == INTRA_FRAME) { + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ? + left_mbmi : above_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) { + if (edge_mbmi->ref_frame[0] == LAST_FRAME) + pred_context = 3; + else + pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME || + edge_mbmi->ref_frame[1] == GOLDEN_FRAME); } - if (!above_in_image) { - above_context = left_context; + } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME && + left_mbmi->ref_frame[1] <= INTRA_FRAME) { + if (above_mbmi->ref_frame[0] == LAST_FRAME && + left_mbmi->ref_frame[0] == LAST_FRAME) { + pred_context = 3; + } else if (above_mbmi->ref_frame[0] == LAST_FRAME || + left_mbmi->ref_frame[0] == LAST_FRAME) { + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ? + left_mbmi : above_mbmi; + + pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + + 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); } - pred_context = (above_context + left_context > max_tx_size); - break; + } else if (above_mbmi->ref_frame[1] > INTRA_FRAME && + left_mbmi->ref_frame[1] > INTRA_FRAME) { + if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && + above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) + pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || + above_mbmi->ref_frame[1] == GOLDEN_FRAME || + left_mbmi->ref_frame[0] == GOLDEN_FRAME || + left_mbmi->ref_frame[1] == GOLDEN_FRAME); + else + pred_context = 2; + } else { + MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + + if (rfs == GOLDEN_FRAME) + pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + else if (rfs == ALTREF_FRAME) + pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; + else + pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); } - - default: - assert(0); - pred_context = 0; // *** add error trap code. - break; + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[0] == INTRA_FRAME || + (edge_mbmi->ref_frame[0] == LAST_FRAME && + edge_mbmi->ref_frame[1] <= INTRA_FRAME)) + pred_context = 2; + else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + else + pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME || + edge_mbmi->ref_frame[1] == GOLDEN_FRAME); + } else { // no edges available (2) + pred_context = 2; } - + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } - -// This function returns a context probability for coding a given -// prediction signal -vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id) { - const int pred_context = vp9_get_pred_context(cm, xd, pred_id); - - switch (pred_id) { - case PRED_SEG_ID: - return cm->segment_pred_probs[pred_context]; - case PRED_MBSKIP: - return cm->fc.mbskip_probs[pred_context]; - case PRED_INTRA_INTER: - return cm->fc.intra_inter_prob[pred_context]; - case PRED_COMP_INTER_INTER: - return cm->fc.comp_inter_prob[pred_context]; - case PRED_COMP_REF_P: - return cm->fc.comp_ref_prob[pred_context]; - case PRED_SINGLE_REF_P1: - return cm->fc.single_ref_prob[pred_context][0]; - case PRED_SINGLE_REF_P2: - return cm->fc.single_ref_prob[pred_context][1]; - default: - assert(0); - return 128; // *** add error trap code. - } -} - -// This function returns a context probability ptr for coding a given -// prediction signal -const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id) { +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. +unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { const MODE_INFO *const mi = xd->mode_info_context; - const int pred_context = vp9_get_pred_context(cm, xd, pred_id); + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type]; + int above_context = max_tx_size; + int left_context = max_tx_size; - switch (pred_id) { - case PRED_SWITCHABLE_INTERP: - return &cm->fc.switchable_interp_prob[pred_context][0]; + if (above_in_image) + above_context = above_mbmi->mb_skip_coeff ? max_tx_size + : above_mbmi->txfm_size; - case PRED_TX_SIZE: - if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16) - return cm->fc.tx_probs_8x8p[pred_context]; - else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32) - return cm->fc.tx_probs_16x16p[pred_context]; - else - return cm->fc.tx_probs_32x32p[pred_context]; + if (left_in_image) + left_context = left_mbmi->mb_skip_coeff ? max_tx_size + : left_mbmi->txfm_size; - default: - assert(0); - return NULL; // *** add error trap code. - } -} + if (!left_in_image) + left_context = above_context; -// This function returns the status of the given prediction signal. -// I.e. is the predicted value for the given signal correct. 
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, - PRED_ID pred_id) { - switch (pred_id) { - case PRED_SEG_ID: - return xd->mode_info_context->mbmi.seg_id_predicted; - case PRED_MBSKIP: - return xd->mode_info_context->mbmi.mb_skip_coeff; - default: - assert(0); - return 0; // *** add error trap code. - } + if (!above_in_image) + above_context = left_context; + + return above_context + left_context > max_tx_size; } -// This function sets the status of the given prediction signal. -// I.e. is the predicted value for the given signal correct. -void vp9_set_pred_flag(MACROBLOCKD *const xd, - PRED_ID pred_id, - unsigned char pred_flag) { - const int mis = xd->mode_info_stride; - BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bh = 1 << mi_height_log2(bsize); +void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag) { + MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col]; const int bw = 1 << mi_width_log2(bsize); -#define sub(a, b) (b) < 0 ? (a) + (b) : (a) - const int x_mis = sub(bw, xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)); - const int y_mis = sub(bh, xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)); -#undef sub + const int bh = 1 << mi_height_log2(bsize); + const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; - switch (pred_id) { - case PRED_SEG_ID: - for (y = 0; y < y_mis; y++) { - for (x = 0; x < x_mis; x++) { - xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag; - } - } - break; - - case PRED_MBSKIP: - for (y = 0; y < y_mis; y++) { - for (x = 0; x < x_mis; x++) { - xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag; - } - } - break; - - default: - assert(0); - // *** add error trap code. - break; - } + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag; } +void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag) { + MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col]; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); + const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); + int x, y; -// The following contain the guts of the prediction code used to -// peredict various bitstream signals. 
+ for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag; +} -// Macroblock segment id prediction function -int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type, - int mi_row, int mi_col) { - const int mi_index = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(sb_type); - const int bh = 1 << mi_height_log2(sb_type); - const int ymis = MIN(cm->mi_rows - mi_row, bh); +int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, + BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); const int xmis = MIN(cm->mi_cols - mi_col, bw); - int segment_id = INT_MAX; - int x, y; + const int ymis = MIN(cm->mi_rows - mi_row, bh); + int x, y, segment_id = INT_MAX; - for (y = 0; y < ymis; y++) { - for (x = 0; x < xmis; x++) { - const int index = mi_index + (y * cm->mi_cols + x); - segment_id = MIN(segment_id, cm->last_frame_seg_map[index]); - } - } + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + segment_id = MIN(segment_id, + segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); return segment_id; } diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index b728724..e4b6575 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -14,40 +14,125 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" -// Predicted items -typedef enum { - PRED_SEG_ID = 0, // Segment identifier - PRED_MBSKIP = 1, - PRED_SWITCHABLE_INTERP = 2, - PRED_INTRA_INTER = 3, - PRED_COMP_INTER_INTER = 4, - PRED_SINGLE_REF_P1 = 5, - PRED_SINGLE_REF_P2 = 6, - PRED_COMP_REF_P = 7, - PRED_TX_SIZE = 8 -} PRED_ID; - -unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, - PRED_ID pred_id); - -void vp9_set_pred_flag(MACROBLOCKD *const xd, - PRED_ID pred_id, - unsigned char pred_flag); - - -int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type, - int mi_row, int mi_col); +int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, + BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col); + + +static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + + return above_mbmi->seg_id_predicted + + (xd->left_available ? 
left_mbmi->seg_id_predicted : 0); +} + +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) { + return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)]; +} + +void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag); + +static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + + return above_mbmi->mb_skip_coeff + + (xd->left_available ? left_mbmi->mb_skip_coeff : 0); +} + +static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)]; +} + +static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { + return xd->mode_info_context->mbmi.mb_skip_coeff; +} + +void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag); + +unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); + +static INLINE const vp9_prob *vp9_get_pred_probs_switchable_interp( + const VP9_COMMON *cm, const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_switchable_interp(xd); + return &cm->fc.switchable_interp_prob[pred_context][0]; +} + +unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_intra_inter(xd); + return cm->fc.intra_inter_prob[pred_context]; +} + +unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd); + + +static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); + return cm->fc.comp_inter_prob[pred_context]; +} + +unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd); + return cm->fc.comp_ref_prob[pred_context]; +} + +unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_single_ref_p1(xd); + return cm->fc.single_ref_prob[pred_context][0]; +} + +unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_single_ref_p2(xd); + return cm->fc.single_ref_prob[pred_context][1]; +} + +unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); + +static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context, + const struct tx_probs *tx_probs) { + if (bsize < BLOCK_SIZE_MB16X16) + return tx_probs->p8x8[context]; + else if (bsize < BLOCK_SIZE_SB32X32) + return tx_probs->p16x16[context]; + else + return tx_probs->p32x32[context]; +} + +static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + const int context = 
vp9_get_pred_context_tx_size(xd); + return get_tx_probs(bsize, context, tx_probs); +} + +static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context, + TX_SIZE tx_size, struct tx_counts *tx_counts) { + if (bsize >= BLOCK_SIZE_SB32X32) + tx_counts->p32x32[context][tx_size]++; + else if (bsize >= BLOCK_SIZE_MB16X16) + tx_counts->p16x16[context][tx_size]++; + else + tx_counts->p8x8[context][tx_size]++; +} #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c index 295c8e7..48d86c5 100644 --- a/libvpx/vp9/common/vp9_quant_common.c +++ b/libvpx/vp9/common/vp9_quant_common.c @@ -12,6 +12,79 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" +#if 1 +static const int16_t dc_qlookup[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, + 13, 14, 15, 16, 17, 18, 19, 19, + 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, + 34, 35, 36, 37, 38, 38, 39, 40, + 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, + 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, + 74, 74, 75, 76, 77, 78, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, + 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, + 140, 142, 144, 146, 148, 150, 152, 154, + 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, + 199, 202, 205, 208, 211, 214, 217, 220, + 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, + 280, 284, 288, 292, 296, 300, 304, 309, + 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, + 389, 395, 400, 406, 411, 417, 423, 429, + 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, + 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, + 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, +}; + +static const int16_t ac_qlookup[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, + 176, 179, 182, 185, 188, 191, 194, 197, + 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, + 265, 270, 275, 280, 285, 290, 295, 300, + 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, + 408, 416, 424, 432, 440, 448, 456, 465, + 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, + 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, + 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, + 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, + 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, + 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +void vp9_init_quant_tables(void) { } +#else static int16_t dc_qlookup[QINDEX_RANGE]; static int16_t ac_qlookup[QINDEX_RANGE]; @@ -46,6 +119,7 @@ void 
vp9_init_quant_tables() { 0.5, ac_val)); } } +#endif int16_t vp9_dc_quant(int qindex, int delta) { return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; @@ -57,9 +131,9 @@ int16_t vp9_ac_quant(int qindex, int delta) { int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) { - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) { - const int data = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q); - return xd->mb_segment_abs_delta == SEGMENT_ABSDATA ? + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q); + return xd->seg.abs_delta == SEGMENT_ABSDATA ? data : // Abs value clamp(base_qindex + data, 0, MAXQ); // Delta value } else { diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index b28d333..63e5646 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -16,6 +16,7 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "./vpx_scale_rtcd.h" static int scale_value_x_with_scaling(int val, const struct scale_factors *scale) { @@ -32,45 +33,42 @@ static int unscaled_value(int val, const struct scale_factors *scale) { return val; } -static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv, - const struct scale_factors *scale) { - // returns mv * scale + offset - int_mv32 result; - const int32_t mv_row_q4 = src_mv->as_mv.row << 1; - const int32_t mv_col_q4 = src_mv->as_mv.col << 1; - - result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) - + scale->y_offset_q4; - result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) - + scale->x_offset_q4; - return result; +static MV32 mv_q3_to_q4_with_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + + scale->y_offset_q4, + ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + + scale->x_offset_q4 + }; + return res; } -static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv, - const struct scale_factors *scale) { - // returns mv * scale + offset - int_mv32 result; - - result.as_mv.row = src_mv->as_mv.row << 1; - result.as_mv.col = src_mv->as_mv.col << 1; - return result; +static MV32 mv_q3_to_q4_without_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + mv->row << 1, + mv->col << 1 + }; + return res; } -static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp, - int offset_q4) { - int32_t scaled_mv; - // returns the scaled and offset value of the mv component. - scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4; - - return scaled_mv; +static MV32 mv_q4_with_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4, + (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4 + }; + return res; } -static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp, - int offset_q4) { - // returns the scaled and offset value of the mv component. 
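/* A stand-alone sketch (not from the patch) of the fixed-point convention the
 * motion-vector helpers in this hunk rely on: q3 (1/8-pel) vectors are
 * doubled into q4 (1/16-pel), and a q4 vector is split into a full-pel source
 * offset (>> 4) and a sub-pel filter phase (& 15). Names are hypothetical. */
#include <stdio.h>

static void split_mv_q3(int mv_row_q3, int mv_col_q3, int src_stride) {
  const int row_q4 = mv_row_q3 << 1;  /* q3 -> q4, unscaled-reference case */
  const int col_q4 = mv_col_q3 << 1;
  const int offset = (row_q4 >> 4) * src_stride + (col_q4 >> 4);
  const int subpel_x = col_q4 & 15;   /* selects subpix->filter_x[subpel_x] */
  const int subpel_y = row_q4 & 15;   /* selects subpix->filter_y[subpel_y] */
  printf("full-pel offset %d, sub-pel phase (%d, %d)\n",
         offset, subpel_x, subpel_y);
}

int main(void) {
  /* MV (5, 19) in 1/8-pel units is (10, 38) in 1/16-pel units; with a
   * 64-byte stride that is 2 full pels to the right and phases (6, 10). */
  split_mv_q3(5, 19, 64);
  return 0;
}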
- (void)scale_fp; - (void)offset_q4; - return mv_q4; +static MV32 mv_q4_without_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + mv->row, + mv->col + }; + return res; } static void set_offsets_with_scaling(struct scale_factors *scale, @@ -112,13 +110,13 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, scale->scale_value_y = unscaled_value; scale->set_scaled_offsets = set_offsets_without_scaling; scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling; - scale->scale_mv_component_q4 = mv_component_q4_without_scaling; + scale->scale_mv_q4 = mv_q4_without_scaling; } else { scale->scale_value_x = scale_value_x_with_scaling; scale->scale_value_y = scale_value_y_with_scaling; scale->set_scaled_offsets = set_offsets_with_scaling; scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling; - scale->scale_mv_component_q4 = mv_component_q4_with_scaling; + scale->scale_mv_q4 = mv_q4_with_scaling; } // TODO(agrange): Investigate the best choice of functions to use here @@ -175,9 +173,7 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, if (xd->mode_info_context) { MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - set_scale_factors(xd, - mbmi->ref_frame[0] - 1, - mbmi->ref_frame[1] - 1, + set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1, cm->active_ref_scale); } @@ -199,124 +195,20 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } -void vp9_copy_mem16x16_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; - dst[8] = src[8]; - dst[9] = src[9]; - dst[10] = src[10]; - dst[11] = src[11]; - dst[12] = src[12]; - dst[13] = src[13]; - dst[14] = src[14]; - dst[15] = src[15]; - -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; - -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int_mv *mv_q3, + const int_mv *src_mv, const struct scale_factors *scale, int w, int h, int weight, - const struct subpix_fn_table *subpix) { - int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale); - src += (mv.as_mv.row >> 4) * 
src_stride + (mv.as_mv.col >> 4); - scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight]( - src, src_stride, dst, dst_stride, - subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4, - subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4, - w, h); -} - -void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int_mv *mv_q4, - const struct scale_factors *scale, - int w, int h, int weight, - const struct subpix_fn_table *subpix) { - const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row, - scale->y_scale_fp, - scale->y_offset_q4); - const int scaled_mv_col_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.col, - scale->x_scale_fp, - scale->x_offset_q4); - const int subpel_x = scaled_mv_col_q4 & 15; - const int subpel_y = scaled_mv_row_q4 & 15; - - src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4); + const struct subpix_fn_table *subpix, + enum mv_precision precision) { + const MV32 mv = precision == MV_PRECISION_Q4 + ? scale->scale_mv_q4(&src_mv->as_mv, scale) + : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale); + const int subpel_x = mv.col & 15; + const int subpel_y = mv.row & 15; + + src += (mv.row >> 4) * src_stride + (mv.col >> 4); scale->predict[!!subpel_x][!!subpel_y][weight]( src, src_stride, dst, dst_stride, subpix->filter_x[subpel_x], scale->x_step_q4, @@ -387,17 +279,16 @@ static void build_inter_predictors(int plane, int block, MACROBLOCKD * const xd = arg->xd; const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - const int bh = 4 << bhl, bw = 4 << bwl; const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl); const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0; int which_mv; - assert(x < bw); - assert(y < bh); + assert(x < (4 << bwl)); + assert(y < (4 << bhl)); assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_w == bw); + 4 << pred_w == (4 << bwl)); assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_h == bh); + 4 << pred_h == (4 << bhl)); for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { // source @@ -405,8 +296,7 @@ static void build_inter_predictors(int plane, int block, const int pre_stride = arg->pre_stride[which_mv][plane]; const uint8_t *const pre = base_pre + scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]); - struct scale_factors * const scale = - plane == 0 ? 
&xd->scale_factor[which_mv] : &xd->scale_factor_uv[which_mv]; + struct scale_factors * const scale = &xd->scale_factor[which_mv]; // dest uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x; @@ -446,11 +336,11 @@ static void build_inter_predictors(int plane, int block, xd->mb_to_bottom_edge); scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor_q4(pre, pre_stride, - dst, arg->dst_stride[plane], - &clamped_mv, &xd->scale_factor[which_mv], - 4 << pred_w, 4 << pred_h, which_mv, - &xd->subpix); + vp9_build_inter_predictor(pre, pre_stride, + dst, arg->dst_stride[plane], + &clamped_mv, &xd->scale_factor[which_mv], + 4 << pred_w, 4 << pred_h, which_mv, + &xd->subpix, MV_PRECISION_Q4); } } void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, @@ -505,13 +395,6 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); } -/*encoder only*/ -void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd, - int mb_row, int mb_col) { - vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col, - BLOCK_SIZE_MB16X16); -} - // TODO(dkovalev: find better place for this function) void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; @@ -523,6 +406,10 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { vp9_setup_scale_factors_for_frame(sf, fb->y_crop_width, fb->y_crop_height, cm->width, cm->height); + + if (sf->x_scale_fp != VP9_REF_NO_SCALE || + sf->y_scale_fp != VP9_REF_NO_SCALE) + vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); } } diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 4e52185..e37750d 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -42,14 +42,8 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, const int_mv *mv_q3, const struct scale_factors *scale, int w, int h, int do_avg, - const struct subpix_fn_table *subpix); - -void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int_mv *mv_q4, - const struct scale_factors *scale, - int w, int h, int do_avg, - const struct subpix_fn_table *subpix); + const struct subpix_fn_table *subpix, + enum mv_precision precision); static int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *scale) { @@ -86,43 +80,29 @@ static void setup_dst_planes(MACROBLOCKD *xd, } } -static void setup_pre_planes(MACROBLOCKD *xd, - const YV12_BUFFER_CONFIG *src0, - const YV12_BUFFER_CONFIG *src1, +static void setup_pre_planes(MACROBLOCKD *xd, int i, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, - const struct scale_factors *scale, - const struct scale_factors *scale_uv) { - const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1}; - int i, j; - - for (i = 0; i < 2; ++i) { - const YV12_BUFFER_CONFIG *src = srcs[i]; - if (src) { - uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *pd = &xd->plane[j]; - const struct scale_factors *sf = j ? scale_uv : scale; - setup_pred_plane(&pd->pre[i], - buffers[j], strides[j], - mi_row, mi_col, sf ? 
&sf[i] : NULL, - pd->subsampling_x, pd->subsampling_y); - } + const struct scale_factors *sf) { + if (src) { + int j; + uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + struct macroblockd_plane *pd = &xd->plane[j]; + setup_pred_plane(&pd->pre[i], buffers[j], strides[j], + mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y); } } } -static void set_scale_factors(MACROBLOCKD *xd, - int ref0, int ref1, - struct scale_factors scale_factor[MAX_REF_FRAMES]) { - - xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0]; - xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0]; - xd->scale_factor_uv[0] = xd->scale_factor[0]; - xd->scale_factor_uv[1] = xd->scale_factor[1]; +static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1, + struct scale_factors sf[MAX_REF_FRAMES]) { + xd->scale_factor[0] = sf[ref0 >= 0 ? ref0 : 0]; + xd->scale_factor[1] = sf[ref1 >= 0 ? ref1 : 0]; } void vp9_setup_scale_factors(VP9_COMMON *cm, int i); diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c index 85dfe51..f351224 100644 --- a/libvpx/vp9/common/vp9_reconintra.c +++ b/libvpx/vp9/common/vp9_reconintra.c @@ -15,187 +15,351 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_once.h" -static void d27_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D27 + ADST_DCT, // D63 + ADST_ADST, // TM + DCT_DCT, // NEARESTMV + DCT_DCT, // NEARMV + DCT_DCT, // ZEROMV + DCT_DCT // NEWMV +}; + +#define intra_pred_sized(type, size) \ +void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \ + ptrdiff_t stride, \ + uint8_t *above_row, \ + uint8_t *left_col) { \ + type##_predictor(pred_ptr, stride, size, above_row, left_col); \ +} +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) + +static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; // first column - for (r = 0; r < bh - 1; ++r) { - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] + - yleft_col[r + 1], 1); + for (r = 0; r < bs - 1; ++r) { + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] + + left_col[r + 1], 1); } - ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1]; - ypred_ptr++; + pred_ptr[(bs - 1) * stride] = left_col[bs - 1]; + pred_ptr++; // second column - for (r = 0; r < bh - 2; ++r) { - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] + - yleft_col[r + 1] * 2 + - yleft_col[r + 2], 2); + for (r = 0; r < bs - 2; ++r) { + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] + + left_col[r + 1] * 2 + + left_col[r + 2], 2); } - ypred_ptr[(bh - 2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[bh - 2] + - yleft_col[bh - 1] * 3, + pred_ptr[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left_col[bs - 2] + + left_col[bs - 1] * 3, 2); - ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1]; - ypred_ptr++; + pred_ptr[(bs - 1) * stride] = left_col[bs - 1]; + pred_ptr++; // rest of last row - for (c = 0; c < bw - 2; ++c) { - ypred_ptr[(bh - 
1) * y_stride + c] = yleft_col[bh-1]; + for (c = 0; c < bs - 2; ++c) { + pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1]; } - for (r = bh - 2; r >= 0; --r) { - for (c = 0; c < bw - 2; ++c) { - ypred_ptr[r * y_stride + c] = ypred_ptr[(r + 1) * y_stride + c - 2]; + for (r = bs - 2; r >= 0; --r) { + for (c = 0; c < bs - 2; ++c) { + pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2]; } } } +intra_pred_allsizes(d27) -static void d63_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { if (r & 1) { - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r/2 + c] + - yabove_row[r/2 + c + 1] * 2 + - yabove_row[r/2 + c + 2], 2); + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + + above_row[r/2 + c + 1] * 2 + + above_row[r/2 + c + 2], 2); } else { - ypred_ptr[c] =ROUND_POWER_OF_TWO(yabove_row[r/2 + c] + - yabove_row[r/2+ c + 1], 1); + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + + above_row[r/2+ c + 1], 1); } } - ypred_ptr += y_stride; + pred_ptr += stride; } } +intra_pred_allsizes(d63) -static void d45_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - if (r + c + 2 < bw * 2) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r + c] + - yabove_row[r + c + 1] * 2 + - yabove_row[r + c + 2], 2); + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + if (r + c + 2 < bs * 2) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] + + above_row[r + c + 1] * 2 + + above_row[r + c + 2], 2); else - ypred_ptr[c] = yabove_row[bw * 2 - 1]; + pred_ptr[c] = above_row[bs * 2 - 1]; } - ypred_ptr += y_stride; + pred_ptr += stride; } } +intra_pred_allsizes(d45) -static void d117_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; // first row - for (c = 0; c < bw; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1); - ypred_ptr += y_stride; + for (c = 0; c < bs; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1); + pred_ptr += stride; // second row - ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + - yabove_row[-1] * 2 + - yabove_row[0], 2); - for (c = 1; c < bw; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] + - yabove_row[c - 1] * 2 + - yabove_row[c], 2); - ypred_ptr += y_stride; + pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + + above_row[-1] * 2 + + above_row[0], 2); + for (c = 1; c < bs; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + + above_row[c - 1] * 2 + + above_row[c], 2); + pred_ptr += stride; // the rest of first col - ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + - yleft_col[0] * 2 + - yleft_col[1], 2); - for (r = 3; r < bh; ++r) - ypred_ptr[(r-2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 3] + - yleft_col[r - 2] * 2 + - yleft_col[r - 1], 2); + pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + + left_col[0] * 2 + + left_col[1], 2); + for (r = 3; r < bs; ++r) + pred_ptr[(r-2) * 
stride] = ROUND_POWER_OF_TWO(left_col[r - 3] + + left_col[r - 2] * 2 + + left_col[r - 1], 2); // the rest of the block - for (r = 2; r < bh; ++r) { - for (c = 1; c < bw; c++) - ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1]; - ypred_ptr += y_stride; + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) + pred_ptr[c] = pred_ptr[-2 * stride + c - 1]; + pred_ptr += stride; } } +intra_pred_allsizes(d117) + +static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r, c; + pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + + above_row[-1] * 2 + + above_row[0], 2); + for (c = 1; c < bs; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + + above_row[c - 1] * 2 + + above_row[c], 2); + + pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] + + left_col[0] * 2 + + left_col[1], 2); + for (r = 2; r < bs; ++r) + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + + left_col[r - 1] * 2 + + left_col[r], 2); + pred_ptr += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) + pred_ptr[c] = pred_ptr[-stride + c - 1]; + pred_ptr += stride; + } +} +intra_pred_allsizes(d135) -static void d135_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + - yabove_row[-1] * 2 + - yabove_row[0], 2); - for (c = 1; c < bw; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] + - yabove_row[c - 1] * 2 + - yabove_row[c], 2); - - ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] + - yleft_col[0] * 2 + - yleft_col[1], 2); - for (r = 2; r < bh; ++r) - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] + - yleft_col[r - 1] * 2 + - yleft_col[r], 2); - - ypred_ptr += y_stride; - for (r = 1; r < bh; ++r) { - for (c = 1; c < bw; c++) - ypred_ptr[c] = ypred_ptr[-y_stride + c - 1]; - ypred_ptr += y_stride; + pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1); + for (r = 1; r < bs; r++) + pred_ptr[r * stride] = + ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1); + pred_ptr++; + + pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + + above_row[-1] * 2 + + above_row[0], 2); + pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] + + left_col[0] * 2 + + left_col[1], 2); + for (r = 2; r < bs; r++) + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + + left_col[r - 1] * 2 + + left_col[r], 2); + pred_ptr++; + + for (c = 0; c < bs - 2; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + + above_row[c] * 2 + + above_row[c + 1], 2); + pred_ptr += stride; + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) + pred_ptr[c] = pred_ptr[-stride + c - 2]; + pred_ptr += stride; + } +} +intra_pred_allsizes(d153) + +static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r; + + for (r = 0; r < bs; r++) { + vpx_memcpy(pred_ptr, above_row, bs); + pred_ptr += stride; } } +intra_pred_allsizes(v) -static void d153_predictor(uint8_t *ypred_ptr, - int y_stride, - int bw, int bh, - uint8_t *yabove_row, - uint8_t *yleft_col) { +static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, left_col[r], bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(h) + +static INLINE void 
tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1); - for (r = 1; r < bh; r++) - ypred_ptr[r * y_stride] = - ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1); - ypred_ptr++; - - ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + - yabove_row[-1] * 2 + - yabove_row[0], 2); - ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] + - yleft_col[0] * 2 + - yleft_col[1], 2); - for (r = 2; r < bh; r++) - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] + - yleft_col[r - 1] * 2 + - yleft_col[r], 2); - ypred_ptr++; - - for (c = 0; c < bw - 2; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + - yabove_row[c] * 2 + - yabove_row[c + 1], 2); - ypred_ptr += y_stride; - for (r = 1; r < bh; ++r) { - for (c = 0; c < bw - 2; c++) - ypred_ptr[c] = ypred_ptr[-y_stride + c - 2]; - ypred_ptr += y_stride; + int ytop_left = above_row[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left); + pred_ptr += stride; + } +} +intra_pred_allsizes(tm) + +static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, 128, bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(dc_128) + +static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride, + int bs, + uint8_t *above_row, uint8_t *left_col) { + int i, r; + int expected_dc = 128; + int average = 0; + const int count = bs; + + for (i = 0; i < bs; i++) + average += left_col[i]; + expected_dc = (average + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, expected_dc, bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(dc_left) + +static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int i, r; + int expected_dc = 128; + int average = 0; + const int count = bs; + + for (i = 0; i < bs; i++) + average += above_row[i]; + expected_dc = (average + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, expected_dc, bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(dc_top) + +static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int i, r; + int expected_dc = 128; + int average = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) + average += above_row[i]; + for (i = 0; i < bs; i++) + average += left_col[i]; + expected_dc = (average + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, expected_dc, bs); + pred_ptr += stride; } } +intra_pred_allsizes(dc) +#undef intra_pred_allsizes + +typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride, + uint8_t *above_row, uint8_t *left_col); + +static intra_pred_fn pred[VP9_INTRA_MODES][4]; +static intra_pred_fn dc_pred[2][2][4]; + +static void init_intra_pred_fn_ptrs(void) { +#define intra_pred_allsizes(l, type) \ + l[0] = vp9_##type##_predictor_4x4; \ + l[1] = vp9_##type##_predictor_8x8; \ + l[2] = vp9_##type##_predictor_16x16; \ + l[3] = vp9_##type##_predictor_32x32 + + intra_pred_allsizes(pred[V_PRED], v); + intra_pred_allsizes(pred[H_PRED], h); + intra_pred_allsizes(pred[D27_PRED], d27); + intra_pred_allsizes(pred[D45_PRED], d45); + intra_pred_allsizes(pred[D63_PRED], d63); + intra_pred_allsizes(pred[D117_PRED], d117); + 
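/* The table initialisation here replaces the old per-mode switch: once
 * init_intra_pred_fn_ptrs() has run (guarded by once()), predictors are
 * dispatched as pred[mode][size_idx] for the angular/V/H/TM modes and
 * dc_pred[left_available][up_available][size_idx] for the four DC variants,
 * where size_idx is the TX_SIZE (0 = 4x4 ... 3 = 32x32). For example, with
 * hypothetical locals dst/stride/above/left, a DC prediction for an 8x8
 * block with both neighbours present and a TM prediction for a 32x32 block
 * would be invoked roughly as:
 *
 *   dc_pred[1][1][TX_8X8](dst, stride, above, left);
 *   pred[TM_PRED][TX_32X32](dst, stride, above, left);
 */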
intra_pred_allsizes(pred[D135_PRED], d135); + intra_pred_allsizes(pred[D153_PRED], d153); + intra_pred_allsizes(pred[TM_PRED], tm); + + intra_pred_allsizes(dc_pred[0][0], dc_128); + intra_pred_allsizes(dc_pred[0][1], dc_top); + intra_pred_allsizes(dc_pred[1][0], dc_left); + intra_pred_allsizes(dc_pred[1][1], dc); + +#undef intra_pred_allsizes +} -void vp9_build_intra_predictors(uint8_t *src, int src_stride, - uint8_t *ypred_ptr, - int y_stride, int mode, - int bw, int bh, - int up_available, int left_available, - int right_available) { - int r, c, i; - uint8_t yleft_col[64], yabove_data[129], ytop_left; - uint8_t *yabove_row = yabove_data + 1; +static void build_intra_predictors(uint8_t *src, int src_stride, + uint8_t *pred_ptr, int stride, + MB_PREDICTION_MODE mode, TX_SIZE txsz, + int up_available, int left_available, + int right_available) { + int i; + DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16); + uint8_t *above_row = yabove_data + 16; + const int bs = 4 << txsz; // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z @@ -204,124 +368,37 @@ void vp9_build_intra_predictors(uint8_t *src, int src_stride, // 129 G H .. S T T T T T // .. - assert(bw == bh); - + once(init_intra_pred_fn_ptrs); if (left_available) { - for (i = 0; i < bh; i++) - yleft_col[i] = src[i * src_stride - 1]; + for (i = 0; i < bs; i++) + left_col[i] = src[i * src_stride - 1]; } else { - vpx_memset(yleft_col, 129, bh); + vpx_memset(left_col, 129, bs); } if (up_available) { - uint8_t *yabove_ptr = src - src_stride; - vpx_memcpy(yabove_row, yabove_ptr, bw); - if (bw == 4 && right_available) - vpx_memcpy(yabove_row + bw, yabove_ptr + bw, bw); - else - vpx_memset(yabove_row + bw, yabove_row[bw -1], bw); - ytop_left = left_available ? yabove_ptr[-1] : 129; - } else { - vpx_memset(yabove_row, 127, bw * 2); - ytop_left = 127; - } - yabove_row[-1] = ytop_left; - - switch (mode) { - case DC_PRED: { - int i; - int expected_dc = 128; - int average = 0; - int count = 0; - - if (up_available || left_available) { - if (up_available) { - for (i = 0; i < bw; i++) - average += yabove_row[i]; - count += bw; - } - if (left_available) { - for (i = 0; i < bh; i++) - average += yleft_col[i]; - count += bh; - } - expected_dc = (average + (count >> 1)) / count; - } - for (r = 0; r < bh; r++) { - vpx_memset(ypred_ptr, expected_dc, bw); - ypred_ptr += y_stride; - } + uint8_t *above_ptr = src - src_stride; + if (bs == 4 && right_available && left_available) { + above_row = above_ptr; + } else { + vpx_memcpy(above_row, above_ptr, bs); + if (bs == 4 && right_available) + vpx_memcpy(above_row + bs, above_ptr + bs, bs); + else + vpx_memset(above_row + bs, above_row[bs - 1], bs); + above_row[-1] = left_available ? 
above_ptr[-1] : 129; } - break; - case V_PRED: - for (r = 0; r < bh; r++) { - vpx_memcpy(ypred_ptr, yabove_row, bw); - ypred_ptr += y_stride; - } - break; - case H_PRED: - for (r = 0; r < bh; r++) { - vpx_memset(ypred_ptr, yleft_col[r], bw); - ypred_ptr += y_stride; - } - break; - case TM_PRED: - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left); - ypred_ptr += y_stride; - } - break; - case D45_PRED: - d45_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D135_PRED: - d135_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D117_PRED: - d117_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D153_PRED: - d153_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D27_PRED: - d27_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D63_PRED: - d63_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - default: - break; + } else { + vpx_memset(above_row, 127, bs * 2); + above_row[-1] = 127; } -} - -void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize) { - const struct macroblockd_plane* const pd = &xd->plane[0]; - const int bw = plane_block_width(bsize, pd); - const int bh = plane_block_height(bsize, pd); - vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride, - pd->dst.buf, pd->dst.stride, - xd->mode_info_context->mbmi.mode, - bw, bh, xd->up_available, xd->left_available, - 0 /*xd->right_available*/); -} -void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize) { - const int bwl = b_width_log2(bsize), bw = 2 << bwl; - const int bhl = b_height_log2(bsize), bh = 2 << bhl; - - vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride, - xd->plane[1].dst.buf, xd->plane[1].dst.stride, - xd->mode_info_context->mbmi.uv_mode, - bw, bh, xd->up_available, - xd->left_available, 0 /*xd->right_available*/); - vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - xd->mode_info_context->mbmi.uv_mode, - bw, bh, xd->up_available, - xd->left_available, 0 /*xd->right_available*/); + if (mode == DC_PRED) { + dc_pred[left_available][up_available][txsz](pred_ptr, stride, + above_row, left_col); + } else { + pred[mode][txsz](pred_ptr, stride, above_row, left_col); + } } void vp9_predict_intra_block(MACROBLOCKD *xd, @@ -329,29 +406,19 @@ void vp9_predict_intra_block(MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, int mode, + uint8_t *reference, int ref_stride, uint8_t *predictor, int pre_stride) { const int bwl = bwl_in - tx_size; const int wmask = (1 << bwl) - 1; const int have_top = (block_idx >> bwl) || xd->up_available; const int have_left = (block_idx & wmask) || xd->left_available; const int have_right = ((block_idx & wmask) != wmask); - const int txfm_block_size = 4 << tx_size; assert(bwl >= 0); - vp9_build_intra_predictors(predictor, pre_stride, - predictor, pre_stride, - mode, - txfm_block_size, - txfm_block_size, - have_top, have_left, - have_right); -} - -void vp9_intra4x4_predict(MACROBLOCKD *xd, - int block_idx, - BLOCK_SIZE_TYPE bsize, - int mode, - uint8_t *predictor, int pre_stride) { - vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4, - mode, predictor, pre_stride); + build_intra_predictors(reference, ref_stride, + predictor, pre_stride, + mode, + tx_size, + have_top, have_left, + have_right); } diff --git 
a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index f5f5f42..e369a71 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -25,6 +25,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, - int mode, + int mode, uint8_t *ref, int ref_stride, uint8_t *predictor, int pre_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index a405aab..c357ef6 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -22,6 +22,8 @@ EOF } forward_decls vp9_common_forward_decls +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 + # # Dequant # @@ -35,46 +37,177 @@ specialize vp9_idct_add_8x8 prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob" specialize vp9_idct_add - - prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob" specialize vp9_idct_add_32x32 # # RECON # -prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem16x16 mmx sse2 dspr2 -vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 +prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_4x4 + +prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d45_predictor_4x4 + +prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_4x4 + +prototype void vp9_h_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_4x4 ssse3 + +prototype void vp9_d117_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_4x4 + +prototype void vp9_d135_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_4x4 + +prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_4x4 + +prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_4x4 sse + +prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_4x4 sse + +prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_4x4 sse + +prototype void vp9_dc_top_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_4x4 + +prototype void vp9_dc_left_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_4x4 + +prototype void vp9_dc_128_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_4x4 + +prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_8x8 + +prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, 
uint8_t *yleft_col" +specialize vp9_d45_predictor_8x8 + +prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_8x8 + +prototype void vp9_h_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_8x8 ssse3 + +prototype void vp9_d117_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_8x8 + +prototype void vp9_d135_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_8x8 + +prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_8x8 + +prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_8x8 sse + +prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_8x8 sse2 + +prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_8x8 sse + +prototype void vp9_dc_top_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_8x8 + +prototype void vp9_dc_left_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_8x8 + +prototype void vp9_dc_128_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_8x8 + +prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_16x16 + +prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d45_predictor_16x16 + +prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_16x16 + +prototype void vp9_h_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_16x16 ssse3 + +prototype void vp9_d117_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_16x16 + +prototype void vp9_d135_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_16x16 + +prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_16x16 -prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x8 mmx dspr2 -vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 +prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_16x16 sse2 -prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx +prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_16x16 
sse2 -prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available" -specialize void vp9_build_intra_predictors +prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_16x16 sse2 -prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize" -specialize vp9_build_intra_predictors_sby_s +prototype void vp9_dc_top_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_16x16 -prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize" -specialize vp9_build_intra_predictors_sbuv_s +prototype void vp9_dc_left_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_16x16 -prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride" -specialize vp9_intra4x4_predict; +prototype void vp9_dc_128_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_16x16 + +prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_32x32 + +prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d45_predictor_32x32 + +prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_32x32 + +prototype void vp9_h_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_32x32 ssse3 + +prototype void vp9_d117_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_32x32 + +prototype void vp9_d135_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_32x32 + +prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_32x32 + +prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_32x32 sse2 + +prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_32x32 sse2_x86_64 + +prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_32x32 sse2 + +prototype void vp9_dc_top_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_32x32 + +prototype void vp9_dc_left_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_32x32 + +prototype void vp9_dc_128_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_32x32 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then prototype void 
vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_8x8 sse2 +specialize vp9_add_constant_residual_8x8 sse2 neon prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_16x16 sse2 +specialize vp9_add_constant_residual_16x16 sse2 neon prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_32x32 sse2 +specialize vp9_add_constant_residual_32x32 sse2 neon fi # @@ -84,19 +217,19 @@ prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t specialize vp9_mb_lpf_vertical_edge_w sse2 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_vertical_edge sse2 +specialize vp9_mbloop_filter_vertical_edge sse2 neon prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_vertical_edge mmx +specialize vp9_loop_filter_vertical_edge mmx neon -prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" +prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mb_lpf_horizontal_edge_w sse2 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_horizontal_edge sse2 +specialize vp9_mbloop_filter_horizontal_edge sse2 neon prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_horizontal_edge mmx +specialize vp9_loop_filter_horizontal_edge mmx neon # # post proc @@ -131,35 +264,41 @@ specialize vp9_blend_b # # Sub Pixel Filters # -prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 ssse3 +prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve_copy sse2 + +prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve_avg sse2 -prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz ssse3 +prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8 ssse3 neon -prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert 
ssse3 +prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_horiz ssse3 neon -prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg ssse3 +prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_vert ssse3 neon -prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_horiz ssse3 +prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg ssse3 neon -prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert ssse3 +prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_horiz ssse3 neon + +prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_vert ssse3 neon # # dct # prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_1_add +specialize vp9_short_idct4x4_1_add sse2 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct4x4_add sse2 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct8x8_add sse2 +specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_8x8_add sse2 @@ -186,21 +325,18 @@ prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int de specialize vp9_short_idct10_32x32_add prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add +specialize vp9_short_iht4x4_add sse2 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht8x8_add +specialize vp9_short_iht8x8_add sse2 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_short_iht16x16_add +specialize vp9_short_iht16x16_add sse2 prototype void vp9_idct4_1d "int16_t *input, int16_t *output" specialize vp9_idct4_1d sse2 # dct and add -prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -specialize vp9_dc_only_idct_add sse2 - prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, 
uint8_t *dest, int dest_stride" specialize vp9_short_iwalsh4x4_1_add @@ -220,8 +356,6 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then # variance -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 - prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_variance32x16 sse2 @@ -266,88 +400,84 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx sse2 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 +specialize vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 +specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 +specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 +specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 +specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 +specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 +specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 +specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 +specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 +specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 +specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 +specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 +specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 mmx -vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 +specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 +specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 mmx -vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 +specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 +specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 +specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 +specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 +specialize vp9_sub_pixel_avg_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse2 mmx -vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +specialize vp9_sub_pixel_variance4x4 sse ssse3 +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 +specialize vp9_sub_pixel_avg_variance4x4 sse ssse3 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad64x64 sse2 @@ -379,7 +509,6 @@ specialize vp9_sad8x16 mmx sse2 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad8x8 mmx sse2 -# TODO(jingning): need to covert these functions into mmx/sse2 form prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad8x4 sse2 @@ -389,16 +518,55 @@ specialize vp9_sad4x8 sse prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad4x4 mmx sse +prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad64x64_avg sse2 + +prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad32x64_avg sse2 + +prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad64x32_avg sse2 + +prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad32x16_avg sse2 + +prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad16x32_avg sse2 + +prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad32x32_avg 
sse2 + +prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad16x16_avg sse2 + +prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad16x8_avg sse2 + +prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad8x16_avg sse2 + +prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad8x8_avg sse2 + +prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad8x4_avg sse2 + +prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad4x8_avg sse + +prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad4x4_avg sse + prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h mmx sse2 +specialize vp9_variance_halfpixvar16x16_h sse2 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v mmx sse2 +specialize vp9_variance_halfpixvar16x16_v sse2 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +specialize vp9_variance_halfpixvar16x16_hv sse2 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" @@ -507,8 +675,8 @@ specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse -prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -specialize vp9_sub_pixel_mse16x16 sse2 mmx +#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +#specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 @@ -533,9 +701,19 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *" 
specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE -prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" -specialize vp9_block_error mmx sse2 -vp9_block_error_sse2=vp9_block_error_xmm +prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz" +specialize vp9_block_error sse2 + +prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" +specialize vp9_subtract_block sse2 + +[ $arch = "x86_64" ] && ssse3_x86_64=ssse3 + +prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +specialize vp9_quantize_b $ssse3_x86_64 + +prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +specialize vp9_quantize_b_32x32 $ssse3_x86_64 # # Structured Similarity (SSIM) @@ -552,13 +730,13 @@ fi # fdct functions prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" -specialize vp9_short_fht4x4 +specialize vp9_short_fht4x4 sse2 prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" -specialize vp9_short_fht8x8 +specialize vp9_short_fht8x8 sse2 prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" -specialize vp9_short_fht16x16 +specialize vp9_short_fht16x16 sse2 prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct8x8 sse2 @@ -573,7 +751,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int specialize vp9_short_fdct32x32 prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct32x32_rd +specialize vp9_short_fdct32x32_rd sse2 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct16x16 sse2 diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c index df7747c..6bfd8f8 100644 --- a/libvpx/vp9/common/vp9_seg_common.c +++ b/libvpx/vp9/common/vp9_seg_common.c @@ -9,36 +9,41 @@ */ #include <assert.h> + #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_quant_common.h" static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; -static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 3, 0 }; + +static const int seg_feature_data_max[SEG_LVL_MAX] = { + MAXQ, MAX_LOOP_FILTER, 3, 0 }; // These functions provide access to new segment level features. // Eventually these function may be "optimized out" but for the moment, // the coding mechanism is still subject to change so these provide a // convenient single point of change. 
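(Editor's sketch, not part of the patch: the comment above describes the segment-level feature accessors, and the hunk that follows moves them from MACROBLOCKD onto the standalone struct segmentation introduced in vp9_seg_common.h later in this diff. The snippet below is a minimal, hedged usage example of that refactored API as declared in this change; the helper name example_segmentation_setup, the segment id 2, and the loop-filter value 32 are invented for illustration, and it assumes compilation inside the libvpx tree so the project header resolves.)

/* Usage sketch for the refactored segmentation API (assumes libvpx tree). */
#include <assert.h>
#include "vp9/common/vp9_seg_common.h"

static void example_segmentation_setup(struct segmentation *seg) {
  vp9_clearall_segfeatures(seg);   /* zero feature_data[] and feature_mask[] */
  seg->enabled = 1;                /* vp9_segfeature_active() requires this  */

  /* Give segment 2 an alternate loop-filter strength of 32
   * (within the MAX_LOOP_FILTER bound checked by vp9_set_segdata). */
  vp9_enable_segfeature(seg, 2, SEG_LVL_ALT_LF);
  vp9_set_segdata(seg, 2, SEG_LVL_ALT_LF, 32);

  assert(vp9_segfeature_active(seg, 2, SEG_LVL_ALT_LF));
  assert(vp9_get_segdata(seg, 2, SEG_LVL_ALT_LF) == 32);

  /* Segments that were never enabled for this feature stay inactive. */
  assert(!vp9_segfeature_active(seg, 0, SEG_LVL_ALT_LF));
}

(Passing struct segmentation * explicitly, rather than reaching through MACROBLOCKD as the removed lines below do, is what lets the encoder and decoder share this state without dragging the whole macroblock context along; the function-by-function diff follows.)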
-int vp9_segfeature_active(const MACROBLOCKD *xd, int segment_id, +int vp9_segfeature_active(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - return xd->segmentation_enabled && - (xd->segment_feature_mask[segment_id] & (1 << feature_id)); + return seg->enabled && + (seg->feature_mask[segment_id] & (1 << feature_id)); } -void vp9_clearall_segfeatures(MACROBLOCKD *xd) { - vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask)); +void vp9_clearall_segfeatures(struct segmentation *seg) { + vp9_zero(seg->feature_data); + vp9_zero(seg->feature_mask); } -void vp9_enable_segfeature(MACROBLOCKD *xd, int segment_id, +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - xd->segment_feature_mask[segment_id] |= 1 << feature_id; + seg->feature_mask[segment_id] |= 1 << feature_id; } -void vp9_disable_segfeature(MACROBLOCKD *xd, int segment_id, +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - xd->segment_feature_mask[segment_id] &= ~(1 << feature_id); + seg->feature_mask[segment_id] &= ~(1 << feature_id); } int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { @@ -49,12 +54,12 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { return seg_feature_data_signed[feature_id]; } -void vp9_clear_segdata(MACROBLOCKD *xd, int segment_id, +void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - xd->segment_feature_data[segment_id][feature_id] = 0; + seg->feature_data[segment_id][feature_id] = 0; } -void vp9_set_segdata(MACROBLOCKD *xd, int segment_id, +void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { assert(seg_data <= seg_feature_data_max[feature_id]); if (seg_data < 0) { @@ -62,12 +67,12 @@ void vp9_set_segdata(MACROBLOCKD *xd, int segment_id, assert(-seg_data <= seg_feature_data_max[feature_id]); } - xd->segment_feature_data[segment_id][feature_id] = seg_data; + seg->feature_data[segment_id][feature_id] = seg_data; } -int vp9_get_segdata(const MACROBLOCKD *xd, int segment_id, +int vp9_get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - return xd->segment_feature_data[segment_id][feature_id]; + return seg->feature_data[segment_id][feature_id]; } diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h index 74ba03c..f22239b 100644 --- a/libvpx/vp9/common/vp9_seg_common.h +++ b/libvpx/vp9/common/vp9_seg_common.h @@ -8,23 +8,54 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_blockd.h" - #ifndef VP9_COMMON_VP9_SEG_COMMON_H_ #define VP9_COMMON_VP9_SEG_COMMON_H_ -int vp9_segfeature_active(const MACROBLOCKD *xd, +#include "vp9/common/vp9_treecoder.h" + +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +#define MAX_SEGMENTS 8 +#define SEG_TREE_PROBS (MAX_SEGMENTS-1) + +#define PREDICTION_PROBS 3 + +// Segment level features. +typedef enum { + SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... + SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... 
+ SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame + SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode + SEG_LVL_MAX = 4 // Number of features supported +} SEG_LVL_FEATURES; + + +struct segmentation { + uint8_t enabled; + uint8_t update_map; + uint8_t update_data; + uint8_t abs_delta; + uint8_t temporal_update; + + vp9_prob tree_probs[SEG_TREE_PROBS]; + vp9_prob pred_probs[PREDICTION_PROBS]; + + int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; + unsigned int feature_mask[MAX_SEGMENTS]; +}; + +int vp9_segfeature_active(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_clearall_segfeatures(MACROBLOCKD *xd); +void vp9_clearall_segfeatures(struct segmentation *seg); -void vp9_enable_segfeature(MACROBLOCKD *xd, +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_disable_segfeature(MACROBLOCKD *xd, +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); @@ -32,16 +63,16 @@ int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); -void vp9_clear_segdata(MACROBLOCKD *xd, +void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_set_segdata(MACROBLOCKD *xd, +void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data); -int vp9_get_segdata(const MACROBLOCKD *xd, +int vp9_get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index 95296ad..a72d2ab 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -10,15 +10,16 @@ #include "vp9/common/vp9_tile_common.h" -#define MIN_TILE_WIDTH 256 -#define MAX_TILE_WIDTH 4096 -#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6) -#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6) - -static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, - int *max_tile_off, int tile_idx, - int log2_n_tiles, int n_mis) { - const int n_sbs = (n_mis + 7) >> 3; +#define MIN_TILE_WIDTH_B64 4 +#define MAX_TILE_WIDTH_B64 64 + +static int to_sbs(n_mis) { + return mi_cols_aligned_to_sb(n_mis) >> LOG2_MI_BLOCK_SIZE; +} + +static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, + int tile_idx, int log2_n_tiles, int n_mis) { + const int n_sbs = to_sbs(n_mis); const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; @@ -27,37 +28,34 @@ static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, } void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) { - cm->cur_tile_col_idx = tile_col_idx; - vp9_get_tile_offsets(cm, &cm->cur_tile_mi_col_start, - &cm->cur_tile_mi_col_end, tile_col_idx, - cm->log2_tile_columns, cm->mi_cols); + vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end, + tile_col_idx, cm->log2_tile_cols, cm->mi_cols); } void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) { - cm->cur_tile_row_idx = tile_row_idx; - vp9_get_tile_offsets(cm, &cm->cur_tile_mi_row_start, - &cm->cur_tile_mi_row_end, tile_row_idx, - cm->log2_tile_rows, cm->mi_rows); + vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end, + tile_row_idx, cm->log2_tile_rows, cm->mi_rows); } -void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr, - int *delta_log2_n_tiles) { - const int sb_cols = 
(cm->mb_cols + 3) >> 2; +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols) { + const int sb_cols = to_sbs(mi_cols); int min_log2_n_tiles, max_log2_n_tiles; for (max_log2_n_tiles = 0; - (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS; + (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64; max_log2_n_tiles++) {} max_log2_n_tiles--; if (max_log2_n_tiles < 0) max_log2_n_tiles = 0; for (min_log2_n_tiles = 0; - (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols; + (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols; min_log2_n_tiles++) {} - assert(max_log2_n_tiles >= min_log2_n_tiles); - *min_log2_n_tiles_ptr = min_log2_n_tiles; - *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles; + assert(min_log2_n_tiles <= max_log2_n_tiles); + + *min_log2_tile_cols = min_log2_n_tiles; + *max_log2_tile_cols = max_log2_n_tiles; } diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h index 7ea3772..6d14560 100644 --- a/libvpx/vp9/common/vp9_tile_common.h +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -17,7 +17,7 @@ void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx); void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); -void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles, - int *delta_log2_n_tiles); +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols); #endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c index 531fa75..2e21a5b 100644 --- a/libvpx/vp9/common/vp9_treecoder.c +++ b/libvpx/vp9/common/vp9_treecoder.c @@ -9,12 +9,9 @@ */ -#include "vpx_config.h" - -#if defined(CONFIG_DEBUG) && CONFIG_DEBUG #include <assert.h> -#endif +#include "./vpx_config.h" #include "vp9/common/vp9_treecoder.h" static void tree2tok(struct vp9_token *const p, vp9_tree t, diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c index 2b66834..3f1c198 100644 --- a/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -121,11 +121,12 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { + /* Ensure the filter can be compressed to int16_t. 
*/ if (x_step_q4 == 16 && filter_x[3] != 128) { while (w >= 16) { vp9_filter_block1d16_h8_ssse3(src, src_stride, @@ -159,8 +160,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -197,8 +198,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -235,8 +236,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -273,8 +274,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -294,8 +295,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { diff --git a/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/libvpx/vp9/common/x86/vp9_copy_sse2.asm new file mode 100644 index 0000000..dd522c6 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_copy_sse2.asm @@ -0,0 +1,152 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1 +INIT_XMM sse2 +cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ + fx, fxs, fy, fys, w, h + mov r4d, dword wm + cmp r4d, 4 + je .w4 + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+16] + pavgb m2, [dstq+32] + pavgb m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq +16] + pavgb m2, [dstq+dst_strideq] + pavgb m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +INIT_MMX sse +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endmacro + +convolve_fn copy +convolve_fn avg diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 599dcff..a1e14b4 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,64 +15,6 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -// In order to improve performance, clip absolute diff values to [0, 255], -// which allows to keep the additions/subtractions in 8 bits. 
-void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, int pitch, int stride) { - int a1; - int16_t out; - uint8_t abs_diff; - __m128i p0, p1, p2, p3; - unsigned int extended_diff; - __m128i diff; - - out = dct_const_round_shift(input_dc * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - // Read prediction data. - p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch)); - p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch)); - p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch)); - p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch)); - - // Unpack prediction data, and store 4x4 array in 1 XMM register. - p0 = _mm_unpacklo_epi32(p0, p1); - p2 = _mm_unpacklo_epi32(p2, p3); - p0 = _mm_unpacklo_epi64(p0, p2); - - // Clip dc value to [0, 255] range. Then, do addition or subtraction - // according to its sign. - if (a1 >= 0) { - abs_diff = (a1 > 255) ? 255 : a1; - extended_diff = abs_diff * 0x01010101u; - diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); - - p1 = _mm_adds_epu8(p0, diff); - } else { - abs_diff = (a1 < -255) ? 255 : -a1; - extended_diff = abs_diff * 0x01010101u; - diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); - - p1 = _mm_subs_epu8(p0, diff); - } - - // Store results to dst. - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); - dst_ptr += stride; - - p1 = _mm_srli_si128(p1, 4); - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); - dst_ptr += stride; - - p1 = _mm_srli_si128(p1, 4); - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); - dst_ptr += stride; - - p1 = _mm_srli_si128(p1, 4); - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); -} - void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); @@ -206,6 +148,23 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE4X4(dest, input3); } +void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest, dc_value); + RECON_AND_STORE4X4(dest, dc_value); + RECON_AND_STORE4X4(dest, dc_value); + RECON_AND_STORE4X4(dest, dc_value); +} + void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { const __m128i zero = _mm_setzero_si128(); const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, @@ -241,6 +200,155 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { _mm_storel_epi64((__m128i *)output, in); } +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +void idct4_1d_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + 
__m128i u[8], v[8]; + + transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[2]); + u[1] = _mm_unpacklo_epi16(in[1], in[3]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[2]); + u[1] = _mm_packs_epi32(v[1], v[3]); + u[2] = _mm_unpackhi_epi64(u[0], u[0]); + u[3] = _mm_unpackhi_epi64(u[1], u[1]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[3]); + in[1] = _mm_add_epi16(u[1], u[2]); + in[2] = _mm_sub_epi16(u[1], u[2]); + in[3] = _mm_sub_epi16(u[0], u[3]); +} + +void iadst4_1d_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + transpose_4x4(in); + in7 = _mm_add_epi16(in[0], in[3]); + in7 = _mm_sub_epi16(in7, in[2]); + + u[0] = _mm_unpacklo_epi16(in[0], in[2]); + u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[1], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + in[2] = _mm_unpackhi_epi64(in[0], in[0]); + in[3] = _mm_unpackhi_epi64(in[1], in[1]); +} + +void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[4]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = _mm_loadl_epi64((__m128i *)input); + in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); + in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); + in[3] = _mm_loadl_epi64((__m128i *)(input + 12)); + + switch (tx_type) { + case 0: // DCT_DCT + idct4_1d_sse2(in); + idct4_1d_sse2(in); + break; + case 1: // ADST_DCT + 
idct4_1d_sse2(in); + iadst4_1d_sse2(in); + break; + case 2: // DCT_ADST + iadst4_1d_sse2(in); + idct4_1d_sse2(in); + break; + case 3: // ADST_ADST + iadst4_1d_sse2(in); + iadst4_1d_sse2(in); + break; + default: + assert(0); + break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[2] = _mm_add_epi16(in[2], eight); + in[3] = _mm_add_epi16(in[3], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + in[2] = _mm_srai_epi16(in[2], 4); + in[3] = _mm_srai_epi16(in[3], 4); + + RECON_AND_STORE4X4(dest, in[0]); + RECON_AND_STORE4X4(dest, in[1]); + RECON_AND_STORE4X4(dest, in[2]); + RECON_AND_STORE4X4(dest, in[3]); +} + #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7) \ { \ @@ -489,6 +597,373 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +void idct8_1d_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + in0 = in[0]; + in1 = in[1]; + in2 = in[2]; + in3 = in[3]; + in4 = in[4]; + in5 = in[5]; + in6 = in[6]; + in7 = in[7]; + + // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 
+ IDCT8x8_1D + in[0] = in0; + in[1] = in1; + in[2] = in2; + in[3] = in3; + in[4] = in4; + in[5] = in5; + in[6] = in6; + in[7] = in7; +} + +void iadst8_1d_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = 
_mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = 
_mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + + +void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + + // load input data + in[0] = _mm_load_si128((__m128i *)input); + in[1] = _mm_load_si128((__m128i *)(input + 8 * 1)); + in[2] = _mm_load_si128((__m128i *)(input + 8 * 2)); + in[3] = _mm_load_si128((__m128i *)(input + 8 * 3)); + in[4] = _mm_load_si128((__m128i *)(input + 8 * 4)); + in[5] = _mm_load_si128((__m128i *)(input + 8 * 5)); + in[6] = _mm_load_si128((__m128i *)(input + 8 * 6)); + in[7] = _mm_load_si128((__m128i *)(input + 8 * 7)); + + switch (tx_type) { + case 0: // DCT_DCT + idct8_1d_sse2(in); + idct8_1d_sse2(in); + break; + case 1: // ADST_DCT + idct8_1d_sse2(in); + iadst8_1d_sse2(in); + break; + case 2: // DCT_ADST + iadst8_1d_sse2(in); + idct8_1d_sse2(in); + break; + case 3: // ADST_ADST + iadst8_1d_sse2(in); + iadst8_1d_sse2(in); + break; + default: + assert(0); + break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, 
in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); +} + void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -974,6 +1449,960 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +void iadst16_1d_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = 
_mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], 
k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + 
s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], 
s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = 
_mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], 
DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +void idct16_1d_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], 
k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], 
DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = 
_mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = 
_mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +void idct16_1d_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_1d_8col(in0); + idct16_1d_8col(in1); +} + +void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + iadst16_1d_8col(in0); + iadst16_1d_8col(in1); +} + +static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((__m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((__m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((__m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((__m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((__m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((__m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((__m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((__m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((__m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((__m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((__m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((__m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((__m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((__m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((__m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((__m128i *)(input + 15 * 16)); +} + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + 
RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); +} + +void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case 0: // DCT_DCT + idct16_1d_sse2(in0, in1); + idct16_1d_sse2(in0, in1); + break; + case 1: // ADST_DCT + idct16_1d_sse2(in0, in1); + iadst16_1d_sse2(in0, in1); + break; + case 2: // DCT_ADST + iadst16_1d_sse2(in0, in1); + idct16_1d_sse2(in0, in1); + break; + case 3: // ADST_ADST + iadst16_1d_sse2(in0, in1); + iadst16_1d_sse2(in0, in1); + break; + default: + assert(0); + break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm new file mode 100644 index 0000000..980b8b9 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm @@ -0,0 +1,341 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 + +SECTION .text + +INIT_MMX sse +cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left + pxor m1, m1 + movd m0, [aboveq] + punpckldq m0, [leftq] + psadbw m0, m1 + paddw m0, [pw_4] + psraw m0, 3 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_MMX sse +cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [pw_8] + psraw m0, 4 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [pw_16] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [pw_32] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_MMX sse +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + 
lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movd m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 + pshufw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -2 + add leftq, 4 + psubw m0, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshufw m2, m2, 0x0 + pshufw m3, m3, 0x0 + paddw m2, m0 + paddw m3, m0 + packuswb m2, m2 + packuswb m3, m3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movq m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -4 + punpcklqdq m2, m2 + add leftq, 8 + psubw m0, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m2, m0 + paddw m3, m0 + packuswb m2, m3 + movq [dstq ], m2 + movhps [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + punpcklbw m2, m1 + punpckhbw m4, m0, m1 + punpcklbw m0, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 + punpcklqdq m2, m2 + add leftq, 16 + psubw m0, m2 + psubw m4, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m5, m2, m0 + paddw m6, m3, m0 + paddw m2, m4 + paddw m3, m4 + packuswb m5, m2 + packuswb m6, m3 + mova [dstq ], m5 + mova [dstq+strideq], m6 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + mova m4, [aboveq+16] + punpcklbw m2, m1 + punpckhbw m3, m0, m1 + punpckhbw m5, m4, m1 + punpcklbw m0, m1 + punpcklbw m4, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + punpcklqdq m2, m2 + add leftq, 32 + psubw m0, m2 + psubw m3, m2 + psubw m4, m2 + psubw m5, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m6, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m6, m1 + pshuflw m2, m2, 0x0 + pshuflw m6, m6, 0x0 + punpcklqdq m2, m2 + punpcklqdq m6, m6 + paddw m7, m2, m0 + paddw m8, m2, m3 + paddw m9, m2, m4 + paddw m2, m5 + packuswb m7, m8 + packuswb m9, m2 + paddw m2, m6, m0 + paddw m8, m6, m3 + mova [dstq ], m7 + paddw m7, m6, m4 + paddw m6, m5 + mova [dstq +16], m9 + packuswb m2, m8 + packuswb m7, m6 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m7 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET +%endif diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm new file mode 100644 index 0000000..bc8ed5c --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm @@ 
-0,0 +1,87 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +INIT_MMX ssse3 +cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 4 + mov lineq, -2 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + movd [dstq ], m1 + movd [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_MMX ssse3 +cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 8 + mov lineq, -4 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + movq [dstq ], m1 + movq [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM ssse3 +cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 16 + mov lineq, -8 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + mova [dstq ], m1 + mova [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM ssse3 +cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 32 + mov lineq, -16 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + mova [dstq ], m1 + mova [dstq +16], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm b/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm deleted file mode 100644 index 1af2521..0000000 --- a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm +++ /dev/null @@ -1,173 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE -sym(vp9_short_inv_walsh4x4_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rax, 3 - - mov rdi, arg(1) - add rax, [rsi] ;input[0] + 3 - - movd mm0, eax - - punpcklwd mm0, mm0 ;x x val val - - punpckldq mm0, mm0 ;val val val val - - psraw mm0, 3 ;(input[0] + 3) >> 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm0 - movq [rdi + 16], mm0 - movq [rdi + 24], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_short_inv_walsh4x4_mmx(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE -sym(vp9_short_inv_walsh4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rax, 3 - mov rsi, arg(0) - mov rdi, arg(1) - shl rax, 16 - - movq mm0, [rsi + 0] ;ip[0] - movq mm1, [rsi + 8] ;ip[4] - or rax, 3 ;00030003h - - movq mm2, [rsi + 16] ;ip[8] - movq mm3, [rsi + 24] ;ip[12] - - movq mm7, rax - movq mm4, mm0 - - punpcklwd mm7, mm7 ;0003000300030003h - movq mm5, mm1 - - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm4 ;temp al - - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl - - ; 03 02 01 00 - ; 13 12 11 10 - ; 23 22 21 20 - ; 33 32 31 30 - - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 - - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] -;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl -;~~~~~~~~~~~~~~~~~~~~~ - movq mm3, mm1 ; 03 02 01 00 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm4, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm4, mm5 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] - - paddw mm0, mm7 - paddw mm1, mm7 - paddw mm2, mm7 - paddw mm3, mm7 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm1 - movq [rdi + 16], mm2 - movq [rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm b/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm deleted file mode 100644 index 84fa2fe..0000000 --- a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm +++ /dev/null @@ -1,119 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_inv_walsh4x4_sse2(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE -sym(vp9_short_inv_walsh4x4_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - SAVE_XMM 6 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rdi, arg(1) - mov rax, 3 - - movdqa xmm0, [rsi + 0] ;ip[4] ip[0] - movdqa xmm1, [rsi + 16] ;ip[12] ip[8] - - shl rax, 16 - or rax, 3 ;00030003h - - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] - - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm4, xmm0 - punpcklqdq xmm0, xmm3 ;d1 a1 - punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm6, eax - - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - -;;;temp output -;; movdqu [rdi + 0], xmm4 -;; movdqu [rdi + 16], xmm3 - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] - - pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 - - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm3 ;d1 a1 - punpckhqdq xmm5, xmm3 ;c1 b1 - - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm6 - paddw xmm1, xmm6 - - psraw xmm5, 3 - psraw xmm1, 3 - - movdqa [rdi + 0], xmm5 - movdqa [rdi + 16], xmm1 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 50f890a..4af4f94 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -12,17 +12,11 @@ #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" -prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2); -prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2); - -extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2; -extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2; - 
-void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]); @@ -483,6 +477,490 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, } } +static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); + DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); + + DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + + DECLARE_ALIGNED(16, unsigned char, ap[8][16]); + DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + + + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + int i = 0; + const unsigned int extended_thresh = _thresh[0] * 0x01010101u; + const unsigned int extended_limit = _limit[0] * 0x01010101u; + const unsigned int extended_blimit = _blimit[0] * 0x01010101u; + const __m128i thresh = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); + const __m128i limit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); + const __m128i blimit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + + _mm_store_si128((__m128i *)ap[4], p4); + _mm_store_si128((__m128i *)ap[3], p3); + _mm_store_si128((__m128i *)ap[2], p2); + _mm_store_si128((__m128i *)ap[1], p1); + _mm_store_si128((__m128i *)ap[0], p0); + _mm_store_si128((__m128i *)aq[4], q4); + _mm_store_si128((__m128i *)aq[3], q3); + _mm_store_si128((__m128i *)aq[2], q2); + _mm_store_si128((__m128i *)aq[1], q1); + _mm_store_si128((__m128i *)aq[0], q0); + + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) 
> limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), + _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), + _mm_subs_epu8(q0, q5))); + _mm_store_si128((__m128i *)ap[5], p5); + _mm_store_si128((__m128i *)aq[5], q5); + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), + 
_mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), + _mm_subs_epu8(q0, q6))); + _mm_store_si128((__m128i *)ap[6], p6); + _mm_store_si128((__m128i *)aq[6], q6); + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), + _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), + _mm_subs_epu8(q0, q7))); + _mm_store_si128((__m128i *)ap[7], p7); + _mm_store_si128((__m128i *)aq[7], q7); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i temp_flat2 = flat2; + unsigned char *src = s; + int i = 0; + do { + __m128i workp_shft; + __m128i a, b, c; + + unsigned int off = i * 8; + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + + c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 + c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); + + b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); + a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); + a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); + + _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q1, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); + _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q2, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); + _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, 
p5)), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q3, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); + _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); + _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(q4, c); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); + _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + a = _mm_add_epi16(q5, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q6, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + temp_flat2 = _mm_srli_si128(temp_flat2, 8); + src += 8; + } while (++i < 2); + } + // wide 
flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + work_a = _mm_load_si128((__m128i *)ap[2]); + p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_store_si128((__m128i *)flat_op[2], p2); + + p1 = _mm_load_si128((__m128i *)flat_op[1]); + work_a = _mm_andnot_si128(flat, ps1); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_store_si128((__m128i *)flat_op[1], p1); + + p0 = _mm_load_si128((__m128i *)flat_op[0]); + work_a = _mm_andnot_si128(flat, ps0); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_store_si128((__m128i *)flat_op[0], p0); + + q0 = _mm_load_si128((__m128i *)flat_oq[0]); + work_a = _mm_andnot_si128(flat, qs0); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_store_si128((__m128i *)flat_oq[0], q0); + + q1 = _mm_load_si128((__m128i *)flat_oq[1]); + work_a = _mm_andnot_si128(flat, qs1); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_store_si128((__m128i *)flat_oq[1], q1); + + work_a = _mm_load_si128((__m128i *)aq[2]); + q2 = _mm_load_si128((__m128i *)flat_oq[2]); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_store_si128((__m128i *)flat_oq[2], q2); + + // write out op6 - op3 + { + unsigned char *dst = (s - 7 * p); + for (i = 6; i > 2; i--) { + __m128i flat2_output; + work_a = _mm_load_si128((__m128i *)ap[i]); + flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); + work_a = _mm_andnot_si128(flat2, work_a); + flat2_output = _mm_and_si128(flat2, flat2_output); + work_a = _mm_or_si128(work_a, flat2_output); + _mm_storeu_si128((__m128i *)dst, work_a); + dst += p; + } + } + + work_a = _mm_load_si128((__m128i *)flat_op[2]); + p2 = _mm_load_si128((__m128i *)flat2_op[2]); + work_a = _mm_andnot_si128(flat2, work_a); + p2 = _mm_and_si128(flat2, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + + work_a = _mm_load_si128((__m128i *)flat_op[1]); + p1 = _mm_load_si128((__m128i *)flat2_op[1]); + work_a = _mm_andnot_si128(flat2, work_a); + p1 = _mm_and_si128(flat2, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + + work_a = _mm_load_si128((__m128i *)flat_op[0]); + p0 = _mm_load_si128((__m128i *)flat2_op[0]); + work_a = _mm_andnot_si128(flat2, work_a); + p0 = _mm_and_si128(flat2, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + + work_a = _mm_load_si128((__m128i *)flat_oq[0]); + q0 = _mm_load_si128((__m128i *)flat2_oq[0]); + work_a = _mm_andnot_si128(flat2, work_a); + q0 = _mm_and_si128(flat2, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + work_a = _mm_load_si128((__m128i *)flat_oq[1]); + q1 = _mm_load_si128((__m128i *)flat2_oq[1]); + work_a = _mm_andnot_si128(flat2, work_a); + q1 = _mm_and_si128(flat2, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + work_a = _mm_load_si128((__m128i *)flat_oq[2]); + q2 = _mm_load_si128((__m128i *)flat2_oq[2]); + work_a = _mm_andnot_si128(flat2, work_a); + q2 = _mm_and_si128(flat2, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + // write out oq3 - oq7 + { + unsigned char *dst = (s + 3 * p); + for (i = 3; i < 7; i++) { + __m128i flat2_output; + work_a = _mm_load_si128((__m128i *)aq[i]); + flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); + work_a = 
_mm_andnot_si128(flat2, work_a); + flat2_output = _mm_and_si128(flat2, flat2_output); + work_a = _mm_or_si128(work_a, flat2_output); + _mm_storeu_si128((__m128i *)dst, work_a); + dst += p; + } + } + } +} + +void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, + int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); +} + void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, int p, const unsigned char *_blimit, @@ -722,79 +1200,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, } } -void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - unsigned char *v) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160); - - /* Read source */ - const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)), - _mm_loadl_epi64((__m128i *)(v - 5 * p))); - const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)), - _mm_loadl_epi64((__m128i *)(v - 4 * p))); - const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)), - _mm_loadl_epi64((__m128i *)(v - 3 * p))); - const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)), - _mm_loadl_epi64((__m128i *)(v - 2 * p))); - const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)), - _mm_loadl_epi64((__m128i *)(v - 1 * p))); - const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)), - _mm_loadl_epi64((__m128i *)(v))); - const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)), - _mm_loadl_epi64((__m128i *)(v + 1 * p))); - const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)), - _mm_loadl_epi64((__m128i *)(v + 2 * p))); - const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)), - _mm_loadl_epi64((__m128i *)(v + 3 * p))); - const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)), - _mm_loadl_epi64((__m128i *)(v + 4 * p))); - - _mm_store_si128((__m128i *)(src), p4); - _mm_store_si128((__m128i *)(src + 16), p3); - _mm_store_si128((__m128i *)(src + 32), p2); - _mm_store_si128((__m128i *)(src + 48), p1); - _mm_store_si128((__m128i *)(src + 64), p0); - _mm_store_si128((__m128i *)(src + 80), q0); - _mm_store_si128((__m128i *)(src + 96), q1); - _mm_store_si128((__m128i *)(src + 112), q2); - _mm_store_si128((__m128i *)(src + 128), q3); - _mm_store_si128((__m128i *)(src + 144), q4); - - /* Loop filtering */ - vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit, - _thresh, 1); - - /* Store result */ - _mm_storel_epi64((__m128i *)(u - 3 * p), - _mm_loadl_epi64((__m128i *)(src + 32))); - _mm_storel_epi64((__m128i *)(u - 2 * p), - _mm_loadl_epi64((__m128i *)(src + 48))); - _mm_storel_epi64((__m128i *)(u - p), - _mm_loadl_epi64((__m128i *)(src + 64))); - _mm_storel_epi64((__m128i *)u, - _mm_loadl_epi64((__m128i *)(src + 80))); - _mm_storel_epi64((__m128i *)(u + p), - _mm_loadl_epi64((__m128i *)(src + 96))); - _mm_storel_epi64((__m128i *)(u + 2 * p), - _mm_loadl_epi64((__m128i *)(src + 112))); - - _mm_storel_epi64((__m128i *)(v - 3 * p), - _mm_loadl_epi64((__m128i *)(src + 40))); - _mm_storel_epi64((__m128i *)(v - 2 * p), - _mm_loadl_epi64((__m128i *)(src + 56))); - _mm_storel_epi64((__m128i *)(v - p), - 
_mm_loadl_epi64((__m128i *)(src + 72))); - _mm_storel_epi64((__m128i *)v, - _mm_loadl_epi64((__m128i *)(src + 88))); - _mm_storel_epi64((__m128i *)(v + p), - _mm_loadl_epi64((__m128i *)(src + 104))); - _mm_storel_epi64((__m128i *)(v + 2 * p), - _mm_loadl_epi64((__m128i *)(src + 120))); -} - static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; @@ -941,7 +1346,7 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, /* Loop filtering */ vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); + thresh, 1); src[0] = t_dst + 3 * 16; src[1] = t_dst + 3 * 16 + 8; @@ -953,10 +1358,10 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, } void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); unsigned char *src[4]; unsigned char *dst[4]; @@ -972,7 +1377,7 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, /* Loop filtering */ vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh); + thresh, 1); src[0] = t_dst; src[1] = t_dst + 8 * 16; @@ -982,32 +1387,3 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, transpose(src, 16, dst, p, 2); } - - -void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - unsigned char *v) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); - unsigned char *src[2]; - unsigned char *dst[2]; - - /* Transpose 16x16 */ - transpose8x16(u - 8, v - 8, p, t_dst, 16); - transpose8x16(u, v, p, t_dst + 16 * 8, 16); - - /* Loop filtering */ - vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); - - src[0] = t_dst + 3 * 16; - src[1] = t_dst + 3 * 16 + 8; - - dst[0] = u - 5; - dst[1] = v - 5; - - /* Transpose 16x8 */ - transpose(src, 16, dst, p, 2); -} diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm b/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm deleted file mode 100644 index 74236cf..0000000 --- a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm +++ /dev/null @@ -1,872 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -; Use of pmaxub instead of psubusb to compute filter mask was seen -; in ffvp8 - -%macro LFH_FILTER_AND_HEV_MASK 1 -%if %1 - movdqa xmm2, [rdi+2*rax] ; q3 - movdqa xmm1, [rsi+2*rax] ; q2 - movdqa xmm4, [rsi+rax] ; q1 - movdqa xmm5, [rsi] ; q0 - neg rax ; negate pitch to deal with above border -%else - movlps xmm2, [rsi + rcx*2] ; q3 - movlps xmm1, [rsi + rcx] ; q2 - movlps xmm4, [rsi] ; q1 - movlps xmm5, [rsi + rax] ; q0 - - movhps xmm2, [rdi + rcx*2] - movhps xmm1, [rdi + rcx] - movhps xmm4, [rdi] - movhps xmm5, [rdi + rax] - - lea rsi, [rsi + rax*4] - lea rdi, [rdi + rax*4] - - movdqa XMMWORD PTR [rsp], xmm1 ; store q2 - movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 -%endif - - movdqa xmm6, xmm1 ; q2 - movdqa xmm3, xmm4 ; q1 - - psubusb xmm1, xmm2 ; q2-=q3 - psubusb xmm2, xmm6 ; q3-=q2 - - psubusb xmm4, xmm6 ; q1-=q2 - psubusb xmm6, xmm3 ; q2-=q1 - - por xmm4, xmm6 ; abs(q2-q1) - por xmm1, xmm2 ; abs(q3-q2) - - movdqa xmm0, xmm5 ; q0 - pmaxub xmm1, xmm4 - - psubusb xmm5, xmm3 ; q0-=q1 - psubusb xmm3, xmm0 ; q1-=q0 - - por xmm5, xmm3 ; abs(q0-q1) - movdqa t0, xmm5 ; save to t0 - - pmaxub xmm1, xmm5 - -%if %1 - movdqa xmm2, [rsi+4*rax] ; p3 - movdqa xmm4, [rdi+4*rax] ; p2 - movdqa xmm6, [rsi+2*rax] ; p1 -%else - movlps xmm2, [rsi + rax] ; p3 - movlps xmm4, [rsi] ; p2 - movlps xmm6, [rsi + rcx] ; p1 - - movhps xmm2, [rdi + rax] - movhps xmm4, [rdi] - movhps xmm6, [rdi + rcx] - - movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 - movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 -%endif - - movdqa xmm5, xmm4 ; p2 - movdqa xmm3, xmm6 ; p1 - - psubusb xmm4, xmm2 ; p2-=p3 - psubusb xmm2, xmm5 ; p3-=p2 - - psubusb xmm3, xmm5 ; p1-=p2 - pmaxub xmm1, xmm4 ; abs(p3 - p2) - - psubusb xmm5, xmm6 ; p2-=p1 - pmaxub xmm1, xmm2 ; abs(p3 - p2) - - pmaxub xmm1, xmm5 ; abs(p2 - p1) - movdqa xmm2, xmm6 ; p1 - - pmaxub xmm1, xmm3 ; abs(p2 - p1) -%if %1 - movdqa xmm4, [rsi+rax] ; p0 - movdqa xmm3, [rdi] ; q1 -%else - movlps xmm4, [rsi + rcx*2] ; p0 - movhps xmm4, [rdi + rcx*2] - movdqa xmm3, q1 ; q1 -%endif - - movdqa xmm5, xmm4 ; p0 - psubusb xmm4, xmm6 ; p0-=p1 - - psubusb xmm6, xmm5 ; p1-=p0 - - por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get blimit - - movdqa t1, xmm6 ; save to t1 - - movdqa xmm4, xmm3 ; q1 - pmaxub xmm1, xmm6 - - psubusb xmm3, xmm2 ; q1-=p1 - psubusb xmm2, xmm4 ; p1-=q1 - - psubusb xmm1, xmm7 - por xmm2, xmm3 ; abs(p1-q1) - - movdqa xmm7, XMMWORD PTR [rdx] ; blimit - - movdqa xmm3, xmm0 ; q0 - pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - - mov rdx, arg(4) ; hev get thresh - - movdqa xmm6, xmm5 ; p0 - psrlw xmm2, 1 ; abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; p0-=q0 - - psubusb xmm3, xmm6 ; q0-=p0 - por xmm5, xmm3 ; abs(p0 - q0) - - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - - movdqa xmm4, t0 ; hev get abs (q1 - q0) - - movdqa xmm3, t1 ; get abs (p1 - p0) - - paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - movdqa xmm2, XMMWORD PTR [rdx] ; hev - - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - psubusb xmm4, xmm2 ; hev - - psubusb xmm3, xmm2 ; hev - por xmm1, xmm5 - - pxor xmm7, xmm7 - paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb xmm4, xmm5 ; hev - pcmpeqb xmm3, xmm3 ; hev - - pcmpeqb xmm1, xmm7 ; mask xmm1 - pxor xmm4, xmm3 ; hev -%endmacro - -%macro B_FILTER 1 -%if %1 == 0 - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 -%elif %1 == 1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 -%elif %1 == 2 - lea rdx, srct - - movdqa xmm2, [rdx] ; p1 - movdqa xmm7, [rdx+48] ; 
q1 - movdqa xmm6, [rdx+16] ; p0 - movdqa xmm0, [rdx+32] ; q0 -%endif - - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - - paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 - - paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - punpckhbw xmm5, xmm2 ; axbxcxdx - punpcklbw xmm2, xmm2 ; exfxgxhx - - punpcklbw xmm0, xmm1 ; exfxgxhx - psraw xmm5, 11 ; sign extended shift right by 3 - - punpckhbw xmm1, xmm1 ; axbxcxdx - psraw xmm2, 11 ; sign extended shift right by 3 - - packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - psraw xmm0, 11 ; sign extended shift right by 3 - - psraw xmm1, 11 ; sign extended shift right by 3 - movdqa xmm5, xmm0 ; save results - - packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw xmm5, [GLOBAL(ones)] - - paddsw xmm1, [GLOBAL(ones)] - psraw xmm5, 1 ; partial shifted one more time for 2nd tap - - psraw xmm1, 1 ; partial shifted one more time for 2nd tap - - paddsb xmm6, xmm2 ; p0+= p0 add - packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - -%if %1 == 0 - movdqa xmm1, p1 ; p1 -%elif %1 == 1 - movdqa xmm1, [rsi+2*rax] ; p1 -%elif %1 == 2 - movdqa xmm1, [rdx] ; p1 -%endif - pandn xmm4, xmm5 ; high edge variance additive - pxor xmm6, [GLOBAL(t80)] ; unoffset - - pxor xmm1, [GLOBAL(t80)] ; reoffset - psubsb xmm3, xmm0 ; q0-= q0 add - - paddsb xmm1, xmm4 ; p1+= p1 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - - pxor xmm1, [GLOBAL(t80)] ; unoffset - psubsb xmm7, xmm4 ; q1-= q1 add - - pxor xmm7, [GLOBAL(t80)] ; unoffset -%if %1 == 0 - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] - movq MMWORD PTR [rsi], xmm6 ; p0 - movhps MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rax], xmm1 ; p1 - movhps MMWORD PTR [rdi + rax], xmm1 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - movhps MMWORD PTR [rdi + rcx], xmm3 - movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 - movhps MMWORD PTR [rdi + rcx*2],xmm7 -%elif %1 == 1 - movdqa [rsi+rax], xmm6 ; write back - movdqa [rsi+2*rax], xmm1 ; write back - movdqa [rsi], xmm3 ; write back - movdqa [rdi], xmm7 ; write back -%endif - -%endmacro - - -;void vp9_loop_filter_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE -sym(vp9_loop_filter_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 1 - ; filter and 
write back the result - B_FILTER 1 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_horizontal_edge_uv_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE -sym(vp9_loop_filter_horizontal_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; - %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; - %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; - %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; - %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ; u - mov rdi, arg(5) ; v - movsxd rax, dword ptr arg(1) ; src_pixel_step - mov rcx, rax - neg rax ; negate pitch to deal with above border - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 0 - ; filter and write back the result - B_FILTER 0 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -%macro TRANSPOSE_16X8 2 - movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 - movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 - movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 - movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 - movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 - - punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - - movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 - - movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 - - movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 - - punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 -%if %1 - lea rsi, [rsi+rax*8] -%else - mov rsi, arg(5) ; v_ptr -%endif - - movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 - punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 - - punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 -%if %1 - lea rdi, [rdi+rax*8] -%else - lea rsi, [rsi - 4] -%endif - - punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 -%if %1 - lea rdx, srct -%else - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing -%endif - - movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - - movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 
44 34 24 14 04 - - punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - - movdqa t0, xmm2 ; save to free XMM2 - movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 - movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 - movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 - movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 - - punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 - - punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 - - movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 - - punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 - - movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 - - punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 - - movdqa xmm6, xmm1 ; - punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 - - punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - - punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - movdqa xmm0, xmm5 - punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - - punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 - - punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 - movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - - punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 -%if %2 - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - movdqa [rdx], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+16], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+32], xmm4 ; save 4 - movdqa [rdx+48], xmm5 ; save 5 - movdqa xmm1, t0 ; get - - movdqa xmm2, xmm1 ; - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 -%else - movdqa [rdx+112], xmm7 ; save 7 - - movdqa [rdx+96], xmm6 ; save 6 - - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - movdqa [rdx+32], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+48], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+64], xmm4 ; save 4 - movdqa [rdx+80], xmm5 ; save 5 - 
movdqa xmm1, t0 ; get - - movdqa xmm2, xmm1 - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - - movdqa [rdx+16], xmm1 - - movdqa [rdx], xmm2 -%endif -%endmacro - -%macro LFV_FILTER_MASK_HEV_MASK 1 - movdqa xmm0, xmm6 ; q2 - psubusb xmm0, xmm7 ; q2-q3 - - psubusb xmm7, xmm6 ; q3-q2 - movdqa xmm4, xmm5 ; q1 - - por xmm7, xmm0 ; abs (q3-q2) - psubusb xmm4, xmm6 ; q1-q2 - - movdqa xmm0, xmm1 - psubusb xmm6, xmm5 ; q2-q1 - - por xmm6, xmm4 ; abs (q2-q1) - psubusb xmm0, xmm2 ; p2 - p3; - - psubusb xmm2, xmm1 ; p3 - p2; - por xmm0, xmm2 ; abs(p2-p3) -%if %1 - movdqa xmm2, [rdx] ; p1 -%else - movdqa xmm2, [rdx+32] ; p1 -%endif - movdqa xmm5, xmm2 ; p1 - pmaxub xmm0, xmm7 - - psubusb xmm5, xmm1 ; p1-p2 - psubusb xmm1, xmm2 ; p2-p1 - - movdqa xmm7, xmm3 ; p0 - psubusb xmm7, xmm2 ; p0-p1 - - por xmm1, xmm5 ; abs(p2-p1) - pmaxub xmm0, xmm6 - - pmaxub xmm0, xmm1 - movdqa xmm1, xmm2 ; p1 - - psubusb xmm2, xmm3 ; p1-p0 - lea rdx, srct - - por xmm2, xmm7 ; abs(p1-p0) - - movdqa t0, xmm2 ; save abs(p1-p0) - - pmaxub xmm0, xmm2 - -%if %1 - movdqa xmm5, [rdx+32] ; q0 - movdqa xmm7, [rdx+48] ; q1 -%else - movdqa xmm5, [rdx+64] ; q0 - movdqa xmm7, [rdx+80] ; q1 -%endif - mov rdx, arg(3) ; limit - - movdqa xmm6, xmm5 ; q0 - movdqa xmm2, xmm7 ; q1 - - psubusb xmm5, xmm7 ; q0-q1 - psubusb xmm7, xmm6 ; q1-q0 - - por xmm7, xmm5 ; abs(q1-q0) - - movdqa t1, xmm7 ; save abs(q1-q0) - - movdqa xmm4, XMMWORD PTR [rdx]; limit - - pmaxub xmm0, xmm7 - mov rdx, arg(2) ; blimit - - psubusb xmm0, xmm4 - movdqa xmm5, xmm2 ; q1 - - psubusb xmm5, xmm1 ; q1-=p1 - psubusb xmm1, xmm2 ; p1-=q1 - - por xmm5, xmm1 ; abs(p1-q1) - movdqa xmm1, xmm3 ; p0 - - pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psubusb xmm1, xmm6 ; p0-q0 - - psrlw xmm5, 1 ; abs(p1-q1)/2 - psubusb xmm6, xmm3 ; q0-p0 - - movdqa xmm4, XMMWORD PTR [rdx]; blimit - - mov rdx, arg(4) ; get thresh - - por xmm1, xmm6 ; abs(q0-p0) - - movdqa xmm6, t0 ; get abs (q1 - q0) - - paddusb xmm1, xmm1 ; abs(q0-p0)*2 - - movdqa xmm3, t1 ; get abs (p1 - p0) - - movdqa xmm7, XMMWORD PTR [rdx] - - paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - - psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - por xmm1, xmm0 ; mask - pcmpeqb xmm6, xmm0 - - pxor xmm0, xmm0 - pcmpeqb xmm4, xmm4 - - pcmpeqb xmm1, xmm0 - pxor xmm4, xmm6 -%endmacro - -%macro BV_TRANSPOSE 0 - ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - - movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - - punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - - movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - - punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 
82 - - punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - - punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 - ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 -%endmacro - -%macro BV_WRITEBACK 2 - movd [rsi+2], %1 - psrldq %1, 4 - - movd [rdi+2], %1 - psrldq %1, 4 - - movd [rsi+2*rax+2], %1 - psrldq %1, 4 - - movd [rdi+2*rax+2], %1 - - movd [rsi+4*rax+2], %2 - psrldq %2, 4 - - movd [rdi+4*rax+2], %2 - psrldq %2, 4 - - movd [rsi+2*rcx+2], %2 - psrldq %2, 4 - - movd [rdi+2*rcx+2], %2 -%endmacro - - -;void vp9_loop_filter_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE -sym(vp9_loop_filter_vertical_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ; src_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax*2+rax] - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. - TRANSPOSE_16X8 1, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 - - ; start work on filters - B_FILTER 2 - - ; tranpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - ; store 16-line result - - lea rdx, [rax] - neg rdx - - BV_WRITEBACK xmm1, xmm5 - - lea rsi, [rsi+rdx*8] - lea rdi, [rdi+rdx*8] - BV_WRITEBACK xmm2, xmm6 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_vertical_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE -sym(vp9_loop_filter_vertical_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ; u_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax+2*rax] - - lea rdx, srct - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
- TRANSPOSE_16X8 0, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 - - ; start work on filters - B_FILTER 2 - - ; tranpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ; store 16-line result - BV_WRITEBACK xmm1, xmm5 - - mov rsi, arg(0) ; u_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - BV_WRITEBACK xmm2, xmm6 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -tfe: - times 16 db 0xfe -align 16 -t80: - times 16 db 0x80 -align 16 -t1s: - times 16 db 0x01 -align 16 -t3: - times 16 db 0x03 -align 16 -t4: - times 16 db 0x04 -align 16 -ones: - times 8 dw 0x0001 -align 16 -s9: - times 8 dw 0x0900 -align 16 -s63: - times 8 dw 0x003f diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h b/libvpx/vp9/common/x86/vp9_loopfilter_x86.h deleted file mode 100644 index fb5af05..0000000 --- a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_ -#define VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_ - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); -#endif - -#if HAVE_SSE2 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2); -#endif - -#endif // LOOPFILTER_X86_H diff --git a/libvpx/vp9/common/x86/vp9_mask_sse3.asm b/libvpx/vp9/common/x86/vp9_mask_sse3.asm deleted file mode 100644 index fe46823..0000000 --- a/libvpx/vp9/common/x86/vp9_mask_sse3.asm +++ /dev/null @@ -1,484 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void int vp8_makemask_sse3( -; unsigned char *y, -; unsigned char *u, -; unsigned char *v, -; unsigned char *ym, -; unsigned char *uvm, -; int yp, -; int uvp, -; int ys, -; int us, -; int vs, -; int yt, -; int ut, -; int vt) -global sym(vp8_makemask_sse3) PRIVATE -sym(vp8_makemask_sse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 14 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;y - mov rdi, arg(1) ;u - mov rcx, arg(2) ;v - mov rax, arg(3) ;ym - movsxd rbx, dword arg(4) ;yp - movsxd rdx, dword arg(5) ;uvp - - pxor xmm0,xmm0 - - ;make 16 copies of the center y value - movd xmm1, arg(6) - pshufb xmm1, xmm0 - - ; make 16 copies of the center u value - movd xmm2, arg(7) - pshufb xmm2, xmm0 - - ; make 16 copies of the center v value - movd xmm3, arg(8) - pshufb xmm3, xmm0 - unpcklpd xmm2, xmm3 - - ;make 16 copies of the y tolerance - movd xmm3, arg(9) - pshufb xmm3, xmm0 - - ;make 16 copies of the u tolerance - movd xmm4, arg(10) - pshufb xmm4, xmm0 - - ;make 16 copies of the v tolerance - movd xmm5, arg(11) - pshufb xmm5, xmm0 - unpckhpd xmm4, xmm5 - - mov r8,8 - -NextPairOfRows: - - ;grab the y source values - movdqu xmm0, [rsi] - - ;compute abs difference between source and y target - movdqa xmm6, xmm1 - movdqa xmm7, xmm0 - psubusb xmm0, xmm1 - psubusb xmm6, xmm7 - por xmm0, xmm6 - - ;compute abs difference between - movdqa xmm6, xmm3 - pcmpgtb xmm6, xmm0 - - ;grab the y source values - add rsi, rbx - movdqu xmm0, [rsi] - - ;compute abs difference between source and y target - movdqa xmm11, xmm1 - movdqa xmm7, xmm0 - psubusb xmm0, xmm1 - psubusb xmm11, xmm7 - por xmm0, xmm11 - - ;compute abs difference between - movdqa xmm11, xmm3 - pcmpgtb xmm11, xmm0 - - - ;grab the u and v source values - movdqu xmm7, [rdi] - movdqu xmm8, [rcx] - unpcklpd xmm7, xmm8 - - ;compute abs difference between source and uv targets - movdqa xmm9, xmm2 - movdqa xmm10, xmm7 - psubusb xmm7, xmm2 - psubusb xmm9, xmm10 - por xmm7, xmm9 - - ;check whether the number is < tolerance - movdqa xmm0, xmm4 - pcmpgtb xmm0, xmm7 - - ;double u and v masks - movdqa xmm8, xmm0 - punpckhbw xmm0, xmm0 - punpcklbw xmm8, xmm8 - - ;mask row 0 and output - pand xmm6, xmm8 - pand xmm6, xmm0 - movdqa [rax],xmm6 - - ;mask row 1 and output - pand xmm11, xmm8 - pand xmm11, xmm0 - movdqa [rax+16],xmm11 - - - ; to the next row or set of rows - add rsi, rbx - add rdi, rdx - add rcx, rdx - add rax,32 - dec r8 - jnz NextPairOfRows - - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;GROW_HORIZ (register for result, source register or mem local) -; takes source and shifts left and ors with source -; then shifts right and ors with source -%macro GROW_HORIZ 2 - movdqa %1, %2 - movdqa xmm14, %1 - movdqa xmm15, %1 - pslldq xmm14, 1 - psrldq xmm15, 1 - por %1,xmm14 - por %1,xmm15 -%endmacro -;GROW_VERT (result, center row, above row, below row) -%macro GROW_VERT 4 - movdqa %1,%2 - por %1,%3 - por %1,%4 -%endmacro - -;GROW_NEXTLINE (new line to grow, new source, line to write) -%macro GROW_NEXTLINE 3 - GROW_HORIZ %1, %2 - GROW_VERT xmm3, xmm0, xmm1, xmm2 - movdqa %3,xmm3 -%endmacro - - -;void int vp8_growmaskmb_sse3( -; unsigned char *om, -; unsigned char *nm, -global sym(vp8_growmaskmb_sse3) PRIVATE -sym(vp8_growmaskmb_sse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src - mov rdi, arg(1) ;rst - - GROW_HORIZ xmm0, [rsi] - GROW_HORIZ xmm1, [rsi+16] - GROW_HORIZ xmm2, [rsi+32] - - GROW_VERT xmm3, 
xmm0, xmm1, xmm2 - por xmm0,xmm1 - movdqa [rdi], xmm0 - movdqa [rdi+16],xmm3 - - GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] - GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] - GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] - GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] - GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] - GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] - GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] - GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] - GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] - GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] - GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] - GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] - GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] - - por xmm0,xmm2 - movdqa [rdi+240], xmm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int vp8_sad16x16_masked_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; unsigned char *mask) -global sym(vp8_sad16x16_masked_wmt) PRIVATE -sym(vp8_sad16x16_masked_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rbx, arg(4) ;mask - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -NextSadRow: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - pand xmm0, xmm2 - pand xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm3, xmm0 - - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz NextSadRow - - movdqa xmm4 , xmm3 - psrldq xmm4, 8 - paddw xmm3, xmm4 - movq rax, xmm3 - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_sad16x16_unmasked_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; unsigned char *mask) -global sym(vp8_sad16x16_unmasked_wmt) PRIVATE -sym(vp8_sad16x16_unmasked_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rbx, arg(4) ;mask - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -next_vp8_sad16x16_unmasked_wmt: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - por xmm0, xmm2 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm3, xmm0 - - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz next_vp8_sad16x16_unmasked_wmt - - movdqa xmm4 , xmm3 - psrldq xmm4, 8 - paddw xmm3, xmm4 - movq rax, xmm3 - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_masked_predictor_wmt( -; unsigned char *masked, -; unsigned char *unmasked, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; unsigned char *mask) -global sym(vp8_masked_predictor_wmt) PRIVATE -sym(vp8_masked_predictor_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;ref_ptr - - mov rbx, arg(5) ;mask - movsxd rax, dword ptr arg(2) ;src_stride - mov r11, arg(3) ; destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -next_vp8_masked_predictor_wmt: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - movdqu [r11], xmm0 - - add r11, rdx - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz next_vp8_masked_predictor_wmt - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp8_masked_predictor_uv_wmt( 
-; unsigned char *masked, -; unsigned char *unmasked, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; unsigned char *mask) -global sym(vp8_masked_predictor_uv_wmt) PRIVATE -sym(vp8_masked_predictor_uv_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;ref_ptr - - mov rbx, arg(5) ;mask - movsxd rax, dword ptr arg(2) ;src_stride - mov r11, arg(3) ; destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - mov rcx, 8 - - pxor xmm3, xmm3 - -next_vp8_masked_predictor_uv_wmt: - movq xmm0, [rsi] - movq xmm1, [rdi] - movq xmm2, [rbx] - - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - movq [r11], xmm0 - - add r11, rdx - add rsi, rax - add rdi, rax - add rbx, 8 - - dec rcx - jnz next_vp8_masked_predictor_uv_wmt - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_uv_from_y_mask( -; unsigned char *ymask, -; unsigned char *uvmask) -global sym(vp8_uv_from_y_mask) PRIVATE -sym(vp8_uv_from_y_mask): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - - mov rcx, 8 - - pxor xmm3, xmm3 - -next_p8_uv_from_y_mask: - movdqu xmm0, [rsi] - pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)] - movq [rdi],xmm0 - add rdi, 8 - add rsi,32 - - dec rcx - jnz next_p8_uv_from_y_mask - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 - diff --git a/libvpx/vp9/common/x86/vp9_recon_mmx.asm b/libvpx/vp9/common/x86/vp9_recon_mmx.asm deleted file mode 100644 index 6fbbe48..0000000 --- a/libvpx/vp9/common/x86/vp9_recon_mmx.asm +++ /dev/null @@ -1,272 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" -;void copy_mem8x8_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem8x8_mmx) PRIVATE -sym(vp9_copy_mem8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - add rsi, rax - - movq [rdi+rcx], mm1 - movq [rdi+rcx*2], mm2 - - - lea rdi, [rdi+rcx*2] - movq mm3, [rsi] - - add rdi, rcx - movq mm4, [rsi+rax] - - movq mm5, [rsi+rax*2] - movq [rdi], mm3 - - lea rsi, [rsi+rax*2] - movq [rdi+rcx], mm4 - - movq [rdi+rcx*2], mm5 - lea rdi, [rdi+rcx*2] - - movq mm0, [rsi+rax] - movq mm1, [rsi+rax*2] - - movq [rdi+rcx], mm0 - movq [rdi+rcx*2],mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem8x4_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem8x4_mmx) PRIVATE -sym(vp9_copy_mem8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - movq [rdi+rcx], mm1 - - movq [rdi+rcx*2], mm2 - lea rdi, [rdi+rcx*2] - - movq mm3, [rsi+rax] - movq [rdi+rcx], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem16x16_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem16x16_mmx) PRIVATE -sym(vp9_copy_mem16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movsxd rax, dword ptr arg(1) ;src_stride; - - mov rdi, arg(2) ;dst; - movsxd rcx, dword ptr arg(3) ;dst_stride - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - 
lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq [rdi], mm0 - movq [rdi+8], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/common/x86/vp9_recon_sse2.asm b/libvpx/vp9/common/x86/vp9_recon_sse2.asm deleted file mode 100644 index 9ee3043..0000000 --- a/libvpx/vp9/common/x86/vp9_recon_sse2.asm +++ /dev/null @@ -1,572 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -;void copy_mem16x16_sse2( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem16x16_sse2) PRIVATE -sym(vp9_copy_mem16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movdqu xmm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movdqu xmm1, [rsi+rax] - movdqu xmm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - lea rdi, [rdi+rcx*2] - movdqu xmm3, [rsi] - - add rdi, rcx - movdqu xmm4, [rsi+rax] - - movdqu xmm5, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm3 - add rsi, rax - - movdqa [rdi+rcx], xmm4 - movdqa [rdi+rcx*2],xmm5 - - lea rdi, [rdi+rcx*2] - movdqu xmm0, [rsi] - - add rdi, rcx - movdqu xmm1, [rsi+rax] - - movdqu xmm2, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - - movdqa [rdi+rcx*2], xmm2 - movdqu xmm3, [rsi] - - movdqu xmm4, [rsi+rax] - lea rdi, [rdi+rcx*2] - - add rdi, rcx - movdqu xmm5, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm3 - - add rsi, rax - movdqa [rdi+rcx], xmm4 - - movdqa [rdi+rcx*2],xmm5 - movdqu xmm0, [rsi] - - lea rdi, [rdi+rcx*2] - movdqu xmm1, [rsi+rax] - - add rdi, rcx - movdqu xmm2, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm0 - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - movdqu xmm3, [rsi+rax] - lea rdi, [rdi+rcx*2] - - movdqa [rdi+rcx], xmm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_intra_pred_uv_dc_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dc_mmx2) PRIVATE -sym(vp9_intra_pred_uv_dc_mmx2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax - pxor mm0, mm0 - movq mm1, [rsi] - psadbw mm1, mm0 - - ; from left - dec rsi - lea rdi, [rax*3] - movzx ecx, byte [rsi+rax] - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx 
- movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - movzx edx, byte [rsi+rax*4] - add ecx, edx - - ; add up - pextrw edx, mm1, 0x0 - lea edx, [edx+ecx+8] - sar edx, 4 - movd mm1, edx - pshufw mm1, mm1, 0x0 - packuswb mm1, mm1 - - ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_dctop_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dctop_mmx2) PRIVATE -sym(vp9_intra_pred_uv_dctop_mmx2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax - pxor mm0, mm0 - movq mm1, [rsi] - psadbw mm1, mm0 - - ; add up - paddw mm1, [GLOBAL(dc_4)] - psraw mm1, 3 - pshufw mm1, mm1, 0x0 - packuswb mm1, mm1 - - ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_dcleft_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dcleft_mmx2) PRIVATE -sym(vp9_intra_pred_uv_dcleft_mmx2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - ; from left - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - dec rsi - lea rdi, [rax*3] - movzx ecx, byte [rsi] - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - lea edx, [ecx+edx+4] - - ; add up - shr edx, 3 - movd mm1, edx - pshufw mm1, mm1, 0x0 - packuswb mm1, mm1 - - ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_dc128_mmx( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dc128_mmx) PRIVATE -sym(vp9_intra_pred_uv_dc128_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - ; end prolog - - ; write out - movq mm1, [GLOBAL(dc_128)] - mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride - lea rcx, [rdx*3] - - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - lea rax, [rax+rdx*4] - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp 
- ret - -;void vp9_intra_pred_uv_tm_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -%macro vp9_intra_pred_uv_tm 1 -global sym(vp9_intra_pred_uv_tm_%1) PRIVATE -sym(vp9_intra_pred_uv_tm_%1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; read top row - mov edx, 4 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax - pxor xmm0, xmm0 -%ifidn %1, ssse3 - movdqa xmm2, [GLOBAL(dc_1024)] -%endif - movq xmm1, [rsi] - punpcklbw xmm1, xmm0 - - ; set up left ptrs ans subtract topleft - movd xmm3, [rsi-1] - lea rsi, [rsi+rax-1] -%ifidn %1, sse2 - punpcklbw xmm3, xmm0 - pshuflw xmm3, xmm3, 0x0 - punpcklqdq xmm3, xmm3 -%else - pshufb xmm3, xmm2 -%endif - psubw xmm1, xmm3 - - ; set up dest ptrs - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - -.vp9_intra_pred_uv_tm_%1_loop: - movd xmm3, [rsi] - movd xmm5, [rsi+rax] -%ifidn %1, sse2 - punpcklbw xmm3, xmm0 - punpcklbw xmm5, xmm0 - pshuflw xmm3, xmm3, 0x0 - pshuflw xmm5, xmm5, 0x0 - punpcklqdq xmm3, xmm3 - punpcklqdq xmm5, xmm5 -%else - pshufb xmm3, xmm2 - pshufb xmm5, xmm2 -%endif - paddw xmm3, xmm1 - paddw xmm5, xmm1 - packuswb xmm3, xmm5 - movq [rdi ], xmm3 - movhps[rdi+rcx], xmm3 - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rcx*2] - dec edx - jnz .vp9_intra_pred_uv_tm_%1_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%endmacro - -vp9_intra_pred_uv_tm sse2 -vp9_intra_pred_uv_tm ssse3 - -;void vp9_intra_pred_uv_ve_mmx( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_ve_mmx) PRIVATE -sym(vp9_intra_pred_uv_ve_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - ; end prolog - - ; read from top - mov rax, arg(2) ;src; - movsxd rdx, dword ptr arg(3) ;src_stride; - sub rax, rdx - movq mm1, [rax] - - ; write out - mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride - lea rcx, [rdx*3] - - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - lea rax, [rax+rdx*4] - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_ho_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -%macro vp9_intra_pred_uv_ho 1 -global sym(vp9_intra_pred_uv_ho_%1) PRIVATE -sym(vp9_intra_pred_uv_ho_%1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi -%ifidn %1, ssse3 -%ifndef GET_GOT_SAVE_ARG - push rbx -%endif - GET_GOT rbx -%endif - ; end prolog - - ; read from left and write out -%ifidn %1, mmx2 - mov edx, 4 -%endif - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride -%ifidn %1, ssse3 - lea rdx, [rcx*3] - movdqa xmm2, [GLOBAL(dc_00001111)] - lea rbx, [rax*3] -%endif - dec rsi -%ifidn %1, mmx2 -.vp9_intra_pred_uv_ho_%1_loop: - movd mm0, [rsi] - movd mm1, [rsi+rax] - punpcklbw mm0, mm0 - punpcklbw mm1, mm1 - pshufw mm0, mm0, 0x0 - pshufw mm1, mm1, 0x0 - movq [rdi ], mm0 - movq [rdi+rcx], mm1 - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rcx*2] - dec edx - jnz .vp9_intra_pred_uv_ho_%1_loop -%else - movd xmm0, [rsi] - movd xmm3, [rsi+rax] - movd xmm1, [rsi+rax*2] - movd xmm4, [rsi+rbx] - punpcklbw xmm0, xmm3 - punpcklbw xmm1, xmm4 - pshufb xmm0, xmm2 - pshufb xmm1, xmm2 - movq [rdi ], xmm0 - movhps [rdi+rcx], xmm0 - movq 
[rdi+rcx*2], xmm1 - movhps [rdi+rdx], xmm1 - lea rsi, [rsi+rax*4] - lea rdi, [rdi+rcx*4] - movd xmm0, [rsi] - movd xmm3, [rsi+rax] - movd xmm1, [rsi+rax*2] - movd xmm4, [rsi+rbx] - punpcklbw xmm0, xmm3 - punpcklbw xmm1, xmm4 - pshufb xmm0, xmm2 - pshufb xmm1, xmm2 - movq [rdi ], xmm0 - movhps [rdi+rcx], xmm0 - movq [rdi+rcx*2], xmm1 - movhps [rdi+rdx], xmm1 -%endif - - ; begin epilog -%ifidn %1, ssse3 - RESTORE_GOT -%ifndef GET_GOT_SAVE_ARG - pop rbx -%endif -%endif - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret -%endmacro - -vp9_intra_pred_uv_ho mmx2 -vp9_intra_pred_uv_ho ssse3 - -SECTION_RODATA -dc_128: - times 8 db 128 -dc_4: - times 4 dw 4 -align 16 -dc_1024: - times 8 dw 0x400 -align 16 -dc_00001111: - times 8 db 0 - times 8 db 1 diff --git a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c b/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c deleted file mode 100644 index 97148fb..0000000 --- a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_blockd.h" - -#define build_intra_predictors_mbuv_prototype(sym) \ - void sym(unsigned char *dst, int dst_stride, \ - const unsigned char *src, int src_stride) -typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t)); - -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3); - -static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_stride, - build_intra_pred_mbuv_fn_t tm_fn, - build_intra_pred_mbuv_fn_t ho_fn) { - int mode = xd->mode_info_context->mbmi.uv_mode; - build_intra_pred_mbuv_fn_t fn; - int src_stride = xd->plane[1].dst.stride; - - switch (mode) { - case V_PRED: - fn = vp9_intra_pred_uv_ve_mmx; - break; - case H_PRED: - fn = ho_fn; - break; - case TM_PRED: - fn = tm_fn; - break; - case DC_PRED: - if (xd->up_available) { - if (xd->left_available) { - fn = vp9_intra_pred_uv_dc_mmx2; - break; - } else { - fn = vp9_intra_pred_uv_dctop_mmx2; - break; - } - } else if (xd->left_available) { - fn = vp9_intra_pred_uv_dcleft_mmx2; - break; - } else { - fn = vp9_intra_pred_uv_dc128_mmx; - break; - } - break; - default: - return; - } - - fn(dst_u, dst_stride, xd->plane[1].dst.buf, src_stride); - fn(dst_v, dst_stride, xd->plane[2].dst.buf, src_stride); -} - -void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - 
vp9_intra_pred_uv_tm_sse2, - vp9_intra_pred_uv_ho_mmx2); -} - -void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - vp9_intra_pred_uv_tm_ssse3, - vp9_intra_pred_uv_ho_ssse3); -} - -void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - vp9_intra_pred_uv_tm_sse2, - vp9_intra_pred_uv_ho_mmx2); -} - -void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - vp9_intra_pred_uv_tm_ssse3, - vp9_intra_pred_uv_ho_ssse3); -} diff --git a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c b/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c deleted file mode 100644 index ed873a5..0000000 --- a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> /* SSE2 */ -#include "vpx/vpx_integer.h" -#include "vpx_ports/emmintrin_compat.h" - -unsigned int vp9_sad16x3_sse2( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - __m128i s0, s1, s2; - __m128i r0, r1, r2; - __m128i sad; - - s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride)); - s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride)); - s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride)); - - r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride)); - r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride)); - r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride)); - - sad = _mm_sad_epu8(s0, r0); - sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1)); - sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2)); - sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); - - return _mm_cvtsi128_si32(sad); -} - -unsigned int vp9_sad3x16_sse2( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - int r; - __m128i s0, s1, s2, s3; - __m128i r0, r1, r2, r3; - __m128i sad = _mm_setzero_si128(); - __m128i mask; - const int offset = (uintptr_t)src_ptr & 3; - - /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off. - * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd - * takes much less time. 
- */ - if (offset == 1) - src_ptr -= 1; - - /* mask = 0xffffffffffff0000ffffffffffff0000 */ - mask = _mm_cmpeq_epi32(sad, sad); - mask = _mm_slli_epi64(mask, 16); - - for (r = 0; r < 16; r += 4) { - s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride)); - s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride)); - s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride)); - s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride)); - r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride)); - r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride)); - r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride)); - r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride)); - - s0 = _mm_unpacklo_epi8(s0, s1); - r0 = _mm_unpacklo_epi8(r0, r1); - s2 = _mm_unpacklo_epi8(s2, s3); - r2 = _mm_unpacklo_epi8(r2, r3); - s0 = _mm_unpacklo_epi64(s0, s2); - r0 = _mm_unpacklo_epi64(r0, r2); - - // throw out extra byte - if (offset == 1) - s0 = _mm_and_si128(s0, mask); - else - s0 = _mm_slli_epi64(s0, 16); - r0 = _mm_slli_epi64(r0, 16); - - sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0)); - - src_ptr += src_stride*4; - ref_ptr += ref_stride*4; - } - - sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); - return _mm_cvtsi128_si32(sad); -} diff --git a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm new file mode 100644 index 0000000..174e747 --- /dev/null +++ b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm @@ -0,0 +1,230 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
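
The NEON kernels added in this file vectorize the scalar add-constant-residual routine that the comments further down quote in C. For orientation, a self-contained sketch of that scalar behavior, assuming a local clip_pixel helper that saturates to the 8-bit range:

    #include <stdint.h>

    static uint8_t clip_pixel(int val) {
      return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));  /* saturate to [0, 255] */
    }

    /* Add the same constant residual to every pixel of a width x height block. */
    static void add_constant_residual(int16_t diff, uint8_t *dest, int stride,
                                      int width, int height) {
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
          dest[c] = clip_pixel(diff + dest[c]);
        dest += stride;
      }
    }

The 8x8, 16x16 and 32x32 NEON entry points each specialize this loop for a fixed block size, branching into a saturating add (vqadd.u8) or subtract (vqsub.u8) path depending on the sign of diff.
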
+; + + EXPORT |vp9_add_constant_residual_8x8_neon| + EXPORT |vp9_add_constant_residual_16x16_neon| + EXPORT |vp9_add_constant_residual_32x32_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10}, [$dst], $stride + vst1.8 {q11}, [$dst], $stride + vst1.8 {q12}, [$dst], $stride + vst1.8 {q13}, [$dst], $stride + vst1.8 {q14}, [$dst], $stride + vst1.8 {q15}, [$dst], $stride + MEND + +; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, +; int width, int height) { +; int r, c; +; +; for (r = 0; r < height; r++) { +; for (c = 0; c < width; c++) +; dest[c] = clip_pixel(diff + dest[c]); +; +; dest += stride; +; } +;} +;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, +; int stride) { +; add_constant_residual(diff, dest, stride, 8, 8); +;} +; r0 : const int16_t diff +; r1 : const uint8_t *dest +; r2 : int stride +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp9_add_constant_residual_8x8_neon| PROC + mov r3, r1 ; r3: save dest to r3 + vld1.8 {d0}, [r1], r2 + vld1.8 {d1}, [r1], r2 + vld1.8 {d2}, [r1], r2 + vld1.8 {d3}, [r1], r2 + vld1.8 {d4}, [r1], r2 + vld1.8 {d5}, [r1], r2 + vld1.8 {d6}, [r1], r2 + vld1.8 {d7}, [r1], r2 + cmp r0, #0 + bge DIFF_POSITIVE_8x8 + +DIFF_NEGATIVE_8x8 ; diff < 0 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q8, r0 + + vqsub.u8 q0, q0, q8 + vqsub.u8 q1, q1, q8 + vqsub.u8 q2, q2, q8 + vqsub.u8 q3, q3, q8 + b DIFF_SAVE_8x8 + +DIFF_POSITIVE_8x8 ; diff >= 0 + usat r0, #8, r0 + vdup.u8 q8, r0 + + vqadd.u8 q0, q0, q8 + vqadd.u8 q1, q1, q8 + vqadd.u8 q2, q2, q8 + vqadd.u8 q3, q3, q8 + +DIFF_SAVE_8x8 + vst1.8 {d0}, [r3], r2 + vst1.8 {d1}, [r3], r2 + vst1.8 {d2}, [r3], r2 + vst1.8 {d3}, [r3], r2 + vst1.8 {d4}, [r3], r2 + vst1.8 {d5}, [r3], r2 + vst1.8 {d6}, [r3], r2 + vst1.8 {d7}, [r3], r2 + + bx lr + ENDP + +;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, +; int stride) { +; add_constant_residual(diff, dest, stride, 16, 16); +;} +; r0 : const int16_t diff +; r1 : const uint8_t *dest +; r2 : int stride +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp9_add_constant_residual_16x16_neon| PROC + mov r3, r1 + LD_16x8 r1, r2 + cmp r0, #0 + bge DIFF_POSITIVE_16x16 + +|DIFF_NEGATIVE_16x16| + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + + SUB_DIFF_16x8 q0 + ST_16x8 r3, r2 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + b DIFF_SAVE_16x16 + +|DIFF_POSITIVE_16x16| + usat r0, #8, r0 + vdup.u8 q0, r0 + + ADD_DIFF_16x8 q0 + ST_16x8 r3, r2 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + +|DIFF_SAVE_16x16| + ST_16x8 r3, r2 + bx lr + ENDP + +;void vp9_add_constant_residual_32x32_c(const int16_t diff, 
uint8_t *dest, +; int stride) { +; add_constant_residual(diff, dest, stride, 32, 32); +;} +; r0 : const int16_t diff +; r1 : const uint8_t *dest +; r2 : int stride +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp9_add_constant_residual_32x32_neon| PROC + push {r4,lr} + pld [r1] + mov r3, r1 + add r4, r1, #16 ; r4 dest + 16 for second loop + cmp r0, #0 + bge DIFF_POSITIVE_32x32 + +|DIFF_NEGATIVE_32x32| + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +|DIFF_NEGATIVE_32x32_LOOP| + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r3, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r3, r2 + cmp r0, #2 + moveq r1, r4 + moveq r3, r4 + cmp r0, #0 + bne DIFF_NEGATIVE_32x32_LOOP + pop {r4,pc} + +|DIFF_POSITIVE_32x32| + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +|DIFF_POSITIVE_32x32_LOOP| + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r3, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r3, r2 + cmp r0, #2 + moveq r1, r4 + moveq r3, r4 + cmp r0, #0 + bne DIFF_POSITIVE_32x32_LOOP + pop {r4,pc} + ENDP + + END diff --git a/libvpx/vp9/decoder/vp9_asm_dec_offsets.c b/libvpx/vp9/decoder/vp9_asm_dec_offsets.c deleted file mode 100644 index e4b9c97..0000000 --- a/libvpx/vp9/decoder/vp9_asm_dec_offsets.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/asm_offsets.h" -#include "vp9/decoder/vp9_onyxd_int.h" - -BEGIN - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.c b/libvpx/vp9/decoder/vp9_dboolhuff.c index df77d65..31b1ae2 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.c +++ b/libvpx/vp9/decoder/vp9_dboolhuff.c @@ -13,6 +13,12 @@ #include "vp9/decoder/vp9_dboolhuff.h" +// This is meant to be a large, positive constant that can still be efficiently +// loaded as an immediate (on platforms like ARM, for example). +// Even relatively modest values like 100 would work fine. +#define VP9_LOTS_OF_BITS 0x40000000 + + int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { int marker_bit; @@ -67,3 +73,20 @@ const uint8_t *vp9_reader_find_end(vp9_reader *r) { return r->buffer; } +int vp9_reader_has_error(vp9_reader *r) { + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. When we reach the end of the + // data, count is additionally filled with VP9_LOTS_OF_BITS. So when + // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. 
+ return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS; +} diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h index b50aa35..c46dd73 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.h +++ b/libvpx/vp9/decoder/vp9_dboolhuff.h @@ -22,11 +22,6 @@ typedef size_t VP9_BD_VALUE; #define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) -// This is meant to be a large, positive constant that can still be efficiently -// loaded as an immediate (on platforms like ARM, for example). -// Even relatively modest values like 100 would work fine. -#define VP9_LOTS_OF_BITS 0x40000000 - typedef struct { const uint8_t *buffer_end; const uint8_t *buffer; @@ -93,22 +88,6 @@ static int vp9_read_literal(vp9_reader *br, int bits) { return z; } -static int vp9_reader_has_error(vp9_reader *r) { - // Check if we have reached the end of the buffer. - // - // Variable 'count' stores the number of bits in the 'value' buffer, minus - // 8. The top byte is part of the algorithm, and the remainder is buffered - // to be shifted into it. So if count == 8, the top 16 bits of 'value' are - // occupied, 8 for the algorithm and 8 in the buffer. - // - // When reading a byte from the user's buffer, count is filled with 8 and - // one byte is filled into the value buffer. When we reach the end of the - // data, count is additionally filled with VP9_LOTS_OF_BITS. So when - // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. - // - // 1 if we have tried to decode bits after the end of stream was encountered. - // 0 No error. - return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS; -} +int vp9_reader_has_error(vp9_reader *r); #endif // VP9_DECODER_VP9_DBOOLHUFF_H_ diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index b3d41be..6f0044a 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -8,151 +8,188 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> -#include "vp9/decoder/vp9_treereader.h" -#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_pred_common.h" -#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" + #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decodframe.h" -#include "vp9/common/vp9_mvref_common.h" -#if CONFIG_DEBUG -#include <assert.h> -#endif +#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_treereader.h" + +static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p); +} -// #define DEBUG_DEC_MV -#ifdef DEBUG_DEC_MV -int dec_mvcount = 0; -#endif +static MB_PREDICTION_MODE read_inter_mode(vp9_reader *r, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(r, vp9_inter_mode_tree, p); +} -// #define DEC_DEBUG -#ifdef DEC_DEBUG -extern int dec_debug; -#endif +static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { + return treed_read(r, vp9_segment_tree, seg->tree_probs); +} -static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { - MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p); - return m; +static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, vp9_reader *r) { + const uint8_t context = vp9_get_pred_context_tx_size(xd); + const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs); + TX_SIZE tx_size = vp9_read(r, tx_probs[0]); + if (tx_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { + tx_size += vp9_read(r, tx_probs[1]); + if (tx_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) + tx_size += vp9_read(r, tx_probs[2]); + } + + update_tx_counts(bsize, context, tx_size, &cm->counts.tx); + return tx_size; } -static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) { - return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs); +static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, + BLOCK_SIZE_TYPE bsize, int select_cond, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + if (tx_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond) + return read_selected_tx_size(cm, xd, bsize, r); + else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32) + return TX_32X32; + else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16) + return TX_16X16; + else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8) + return TX_8X8; + else + return TX_4X4; } -static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi, +static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col, int segment_id) { - const int mi_index = mi_row * cm->mi_cols + mi_col; - const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type; - const int bw = 1 << mi_width_log2(sb_type); - const int bh = 1 << mi_height_log2(sb_type); - const int ymis = MIN(cm->mi_rows - mi_row, bh); + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); 
const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; - for (y = 0; y < ymis; y++) { - for (x = 0; x < xmis; x++) { - const int index = mi_index + (y * cm->mi_cols + x); - cm->last_frame_seg_map[index] = segment_id; - } - } -} + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); -static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); - const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); - TX_SIZE txfm_size = vp9_read(r, tx_probs[0]); - if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { - txfm_size += vp9_read(r, tx_probs[1]); - if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) - txfm_size += vp9_read(r, tx_probs[2]); - } - if (bsize >= BLOCK_SIZE_SB32X32) { - cm->fc.tx_count_32x32p[context][txfm_size]++; - } else if (bsize >= BLOCK_SIZE_MB16X16) { - cm->fc.tx_count_16x16p[context][txfm_size]++; - } else { - cm->fc.tx_count_8x8p[context][txfm_size]++; - } - return txfm_size; + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; } +static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, + vp9_reader *r) { + MACROBLOCKD *const xd = &pbi->mb; + struct segmentation *const seg = &xd->seg; + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + int segment_id; + + if (!seg->enabled) + return 0; // Default for disabled segmentation + + if (!seg->update_map) + return 0; -static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m, - int mi_row, int mi_col, - vp9_reader *r) { + segment_id = read_segment_id(r, seg); + set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id); + return segment_id; +} + +static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, + vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int mis = cm->mode_info_stride; + struct segmentation *const seg = &xd->seg; + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + int pred_segment_id, segment_id; - // Read segmentation map if it is being updated explicitly this frame - m->mbmi.segment_id = 0; - if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { - m->mbmi.segment_id = read_mb_segid(r, xd); - set_segment_id(cm, &m->mbmi, mi_row, mi_col, m->mbmi.segment_id); - } + if (!seg->enabled) + return 0; // Default for disabled segmentation - m->mbmi.mb_skip_coeff = vp9_segfeature_active(xd, m->mbmi.segment_id, - SEG_LVL_SKIP); - if (!m->mbmi.mb_skip_coeff) { - m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP)); - cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)] - [m->mbmi.mb_skip_coeff]++; + pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, + bsize, mi_row, mi_col); + if (!seg->update_map) + return pred_segment_id; + + if (seg->temporal_update) { + const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + const int pred_flag = vp9_read(r, pred_prob); + vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + segment_id = pred_flag ? 
pred_segment_id + : read_segment_id(r, seg); + } else { + segment_id = read_segment_id(r, seg); } + set_segment_id(cm, bsize, mi_row, mi_col, segment_id); + return segment_id; +} - if (cm->txfm_mode == TX_MODE_SELECT && - m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type); - } else if (cm->txfm_mode >= ALLOW_32X32 && - m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { - m->mbmi.txfm_size = TX_32X32; - } else if (cm->txfm_mode >= ALLOW_16X16 && - m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) { - m->mbmi.txfm_size = TX_16X16; - } else if (cm->txfm_mode >= ALLOW_8X8 && - m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - m->mbmi.txfm_size = TX_8X8; - } else { - m->mbmi.txfm_size = TX_4X4; +static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + int skip_coeff = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); + if (!skip_coeff) { + const int ctx = vp9_get_pred_context_mbskip(xd); + skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); + cm->counts.mbskip[ctx][skip_coeff]++; } + return skip_coeff; +} - // luma mode - m->mbmi.ref_frame[0] = INTRA_FRAME; - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { +static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m, + int mi_row, int mi_col, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + MB_MODE_INFO *const mbmi = &m->mbmi; + const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const int mis = cm->mode_info_stride; + + mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r); + mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); + mbmi->ref_frame[0] = INTRA_FRAME; + + if (bsize >= BLOCK_SIZE_SB8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); const MB_PREDICTION_MODE L = xd->left_available ? left_block_mode(m, 0) : DC_PRED; - m->mbmi.mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]); + mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); } else { + // Only 4x4, 4x8, 8x4 blocks + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); int idx, idy; - int bw = 1 << b_width_log2(m->mbmi.sb_type); - int bh = 1 << b_height_log2(m->mbmi.sb_type); for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { - int ib = idy * 2 + idx; - int k; + const int ib = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
left_block_mode(m, ib) : DC_PRED; - m->bmi[ib].as_mode.first = - read_intra_mode(r, cm->kf_y_mode_prob[A][L]); - for (k = 1; k < bh; ++k) - m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first; - for (k = 1; k < bw; ++k) - m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first; + const MB_PREDICTION_MODE b_mode = read_intra_mode(r, + vp9_kf_y_mode_prob[A][L]); + m->bmi[ib].as_mode = b_mode; + if (bh == 2) + m->bmi[ib + 2].as_mode = b_mode; + if (bw == 2) + m->bmi[ib + 1].as_mode = b_mode; } } - m->mbmi.mode = m->bmi[3].as_mode.first; + + mbmi->mode = m->bmi[3].as_mode; } - m->mbmi.uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]); + mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]); } static int read_mv_component(vp9_reader *r, @@ -161,9 +198,10 @@ static int read_mv_component(vp9_reader *r, int mag, d, fr, hp; const int sign = vp9_read(r, mvcomp->sign); const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); + const int class0 = mv_class == MV_CLASS_0; // Integer part - if (mv_class == MV_CLASS_0) { + if (class0) { d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0); } else { int i; @@ -176,66 +214,77 @@ static int read_mv_component(vp9_reader *r, // Fractional part fr = treed_read(r, vp9_mv_fp_tree, - mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp); + class0 ? mvcomp->class0_fp[d] : mvcomp->fp); // High precision part (if hp is not used, the default value of the hp is 1) - hp = usehp ? vp9_read(r, - mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp) + hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp) : 1; - // result + // Result mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1; return sign ? -mag : mag; } -static void update_nmv(vp9_reader *r, vp9_prob *const p, - const vp9_prob upd_p) { - if (vp9_read(r, upd_p)) { -#ifdef LOW_PRECISION_MV_UPDATE +static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, + const nmv_context *ctx, + nmv_context_counts *counts, int usehp) { + const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); + MV diff = {0, 0}; + + usehp = usehp && vp9_use_mv_hp(ref); + if (mv_joint_vertical(j)) + diff.row = read_mv_component(r, &ctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + diff.col = read_mv_component(r, &ctx->comps[1], usehp); + + vp9_inc_mv(&diff, counts); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static void update_mv(vp9_reader *r, vp9_prob *p, vp9_prob upd_p) { + if (vp9_read(r, upd_p)) *p = (vp9_read_literal(r, 7) << 1) | 1; -#else - *p = (vp9_read_literal(r, 8)); -#endif - } } -static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx, - int usehp) { +static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) { int i, j, k; -#ifdef MV_GROUP_UPDATE - if (!vp9_read_bit(r)) - return; -#endif for (j = 0; j < MV_JOINTS - 1; ++j) - update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->joints[j], VP9_NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { - update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB); + nmv_component *const comp = &mvc->comps[i]; + + update_mv(r, &comp->sign, VP9_NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) - update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->classes[j], VP9_NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) - update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->class0[j], VP9_NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) - update_nmv(r, 
&mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->bits[j], VP9_NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { + nmv_component *const comp = &mvc->comps[i]; + for (j = 0; j < CLASS0_SIZE; ++j) for (k = 0; k < 3; ++k) - update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->class0_fp[j][k], VP9_NMV_UPDATE_PROB); for (j = 0; j < 3; ++j) - update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->fp[j], VP9_NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { - update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB); - update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->comps[i].hp, VP9_NMV_UPDATE_PROB); } } } @@ -245,205 +294,71 @@ static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int seg_ref_active = vp9_segfeature_active(xd, segment_id, - SEG_LVL_REF_FRAME); + FRAME_CONTEXT *const fc = &cm->fc; + FRAME_COUNTS *const counts = &cm->counts; - // Segment reference frame features not available. - if (!seg_ref_active) { + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME); + ref_frame[1] = NONE; + } else { + const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); int is_comp; - int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER); if (cm->comp_pred_mode == HYBRID_PREDICTION) { - is_comp = vp9_read(r, cm->fc.comp_inter_prob[comp_ctx]); - cm->fc.comp_inter_count[comp_ctx][is_comp]++; + is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]); + counts->comp_inter[comp_ctx][is_comp]++; } else { is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY; } // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding if (is_comp) { - int b, fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; - int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P); - - ref_frame[fix_ref_idx] = cm->comp_fixed_ref; - b = vp9_read(r, cm->fc.comp_ref_prob[ref_ctx]); - cm->fc.comp_ref_count[ref_ctx][b]++; + const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]); + counts->comp_ref[ref_ctx][b]++; + ref_frame[fix_ref_idx] = cm->comp_fixed_ref; ref_frame[!fix_ref_idx] = cm->comp_var_ref[b]; } else { - int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1); + const int ref1_ctx = vp9_get_pred_context_single_ref_p1(xd); ref_frame[1] = NONE; - if (vp9_read(r, cm->fc.single_ref_prob[ref1_ctx][0])) { - int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2); - int b2 = vp9_read(r, cm->fc.single_ref_prob[ref2_ctx][1]); - ref_frame[0] = b2 ? ALTREF_FRAME : GOLDEN_FRAME; - cm->fc.single_ref_count[ref1_ctx][0][1]++; - cm->fc.single_ref_count[ref2_ctx][1][b2]++; + if (vp9_read(r, fc->single_ref_prob[ref1_ctx][0])) { + const int ref2_ctx = vp9_get_pred_context_single_ref_p2(xd); + const int b = vp9_read(r, fc->single_ref_prob[ref2_ctx][1]); + ref_frame[0] = b ? 
ALTREF_FRAME : GOLDEN_FRAME; + counts->single_ref[ref1_ctx][0][1]++; + counts->single_ref[ref2_ctx][1][b]++; } else { ref_frame[0] = LAST_FRAME; - cm->fc.single_ref_count[ref1_ctx][0][0]++; + counts->single_ref[ref1_ctx][0][0]++; } } - } else { - ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; } } -static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) { - return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p); -} - -#ifdef VPX_MODE_COUNT -unsigned int vp9_mv_cont_count[5][4] = { - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 } -}; -#endif - -static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) { +static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.switchable_interp_prob[j][i] = - // vp9_read_prob(r); - vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]); - } - } + for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j) + for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); } -static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) { +static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < VP9_INTER_MODES - 1; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r); - cm->fc.inter_mode_probs[i][j] = - vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]); - } - } + for (j = 0; j < VP9_INTER_MODES - 1; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); } static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { COMPPREDMODE_TYPE mode = vp9_read_bit(r); if (mode) - mode += vp9_read_bit(r); + mode += vp9_read_bit(r); return mode; } -static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - - if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) { - nmv_context *const nmvc = &pbi->common.fc.nmvc; - MACROBLOCKD *const xd = &pbi->mb; - int i, j; - - read_inter_mode_probs(cm, r); - - if (cm->mcomp_filter_type == SWITCHABLE) - read_switchable_interp_probs(cm, r); - - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.intra_inter_prob[i] = - vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]); - } - - if (cm->allow_comp_inter_inter) { - cm->comp_pred_mode = read_comp_pred_mode(r); - if (cm->comp_pred_mode == HYBRID_PREDICTION) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.comp_inter_prob[i] = - vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]); - } else { - cm->comp_pred_mode = SINGLE_PREDICTION_ONLY; - } - - if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.single_ref_prob[i][0] = - vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]); - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.single_ref_prob[i][1] = - vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]); - } - - if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - 
cm->fc.comp_ref_prob[i] = - vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]); - - // VP9_INTRA_MODES - for (j = 0; j < BLOCK_SIZE_GROUPS; j++) { - for (i = 0; i < VP9_INTRA_MODES - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.y_mode_prob[j][i] = - vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]); - } - } - } - for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) { - for (i = 0; i < PARTITION_TYPES - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.partition_prob[INTER_FRAME][j][i] = - vp9_read_prob_diff_update(r, - cm->fc.partition_prob[INTER_FRAME][j][i]); - } - } - } - - read_nmvprobs(r, nmvc, xd->allow_high_precision_mv); - } -} - -// This function either reads the segment id for the current macroblock from -// the bitstream or if the value is temporally predicted asserts the predicted -// value -static int read_mb_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *const mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; - - if (!xd->segmentation_enabled) - return 0; // Default for disabled segmentation - - if (xd->update_mb_segmentation_map) { - int segment_id; - - if (cm->temporal_update) { - // Temporal coding of the segment id for this mb is enabled. - // Get the context based probability for reading the - // prediction status flag - const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID); - const int pred_flag = vp9_read(r, pred_prob); - vp9_set_pred_flag(xd, PRED_SEG_ID, pred_flag); - - // If the value is flagged as correctly predicted - // then use the predicted value, otherwise decode it explicitly - segment_id = pred_flag ? vp9_get_pred_mi_segid(cm, mbmi->sb_type, - mi_row, mi_col) - : read_mb_segid(r, xd); - } else { - segment_id = read_mb_segid(r, xd); // Normal unpredicted coding mode - } - - set_segment_id(cm, mbmi, mi_row, mi_col, segment_id); // Side effect - return segment_id; - } else { - return vp9_get_pred_mi_segid(cm, mbmi->sb_type, mi_row, mi_col); - } -} - - static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src, int mb_to_left_edge, int mb_to_right_edge, @@ -454,242 +369,188 @@ static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src, mb_to_bottom_edge); } -static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref, - const nmv_context *ctx, - nmv_context_counts *counts, - int usehp) { - const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); - MV diff = {0, 0}; - - usehp = usehp && vp9_use_nmv_hp(ref); - if (mv_joint_vertical(j)) - diff.row = read_mv_component(r, &ctx->comps[0], usehp); +static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( + VP9D_COMP *pbi, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const vp9_prob *probs = vp9_get_pred_probs_switchable_interp(cm, xd); + const int index = treed_read(r, vp9_switchable_interp_tree, probs); + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][index]; + return vp9_switchable_interp[index]; +} - if (mv_joint_horizontal(j)) - diff.col = read_mv_component(r, &ctx->comps[1], usehp); +static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - vp9_increment_nmv(&diff, ref, counts, 
usehp); + if (bsize >= BLOCK_SIZE_SB8X8) { + const int size_group = MIN(3, MIN(bwl, bhl)); + mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]); + cm->counts.y_mode[size_group][mbmi->mode]++; + } else { + // Only 4x4, 4x8, 8x4 blocks + const int bw = 1 << bwl, bh = 1 << bhl; + int idx, idy; + + for (idy = 0; idy < 2; idy += bh) { + for (idx = 0; idx < 2; idx += bw) { + const int ib = idy * 2 + idx; + const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]); + mi->bmi[ib].as_mode = b_mode; + cm->counts.y_mode[0][b_mode]++; + + if (bh == 2) + mi->bmi[ib + 2].as_mode = b_mode; + if (bw == 2) + mi->bmi[ib + 1].as_mode = b_mode; + } + } + mbmi->mode = mi->bmi[3].as_mode; + } - mv->row = diff.row + ref->row; - mv->col = diff.col + ref->col; + mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); + cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++; } -static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( - VP9D_COMP *pbi, vp9_reader *r) { - const int index = treed_read(r, vp9_switchable_interp_tree, - vp9_get_pred_probs(&pbi->common, &pbi->mb, - PRED_SWITCHABLE_INTERP)); - ++pbi->common.fc.switchable_interp_count - [vp9_get_pred_context( - &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index]; - return vp9_switchable_interp[index]; +static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + MV_REFERENCE_FRAME ref; + if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { + const int ctx = vp9_get_pred_context_intra_inter(xd); + ref = (MV_REFERENCE_FRAME) + vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd)); + cm->counts.intra_inter[ctx][ref != INTRA_FRAME]++; + } else { + ref = (MV_REFERENCE_FRAME) vp9_get_segdata(&xd->seg, segment_id, + SEG_LVL_REF_FRAME) != INTRA_FRAME; + } + return ref; } -static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, - int mi_row, int mi_col, - vp9_reader *r) { +static void read_inter_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, + int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; - nmv_context *const nmvc = &cm->fc.nmvc; MACROBLOCKD *const xd = &pbi->mb; + nmv_context *const nmvc = &cm->fc.nmvc; + MB_MODE_INFO *const mbmi = &mi->mbmi; int_mv *const mv0 = &mbmi->mv[0]; int_mv *const mv1 = &mbmi->mv[1]; - BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); - int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge; - int j, idx, idy; + int idx, idy; + mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); + mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r); mbmi->ref_frame[1] = NONE; + mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, + (!mbmi->mb_skip_coeff || mbmi->ref_frame[0] == INTRA_FRAME), r); - // Make sure the MACROBLOCKD mode info pointer is pointed at the - // correct entry for the current macroblock. - xd->mode_info_context = mi; - - // Distance of Mb to the various image edges. 
- // These specified to 8th pel as they are always compared to MV values - // that are in 1/8th pel units - set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize), - mi_col, 1 << mi_width_log2(bsize)); - - mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN; - mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; - mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN; - mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; - - // Read the macroblock segment id. - mbmi->segment_id = read_mb_segment_id(pbi, mi_row, mi_col, r); - - mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id, - SEG_LVL_SKIP); - if (!mbmi->mb_skip_coeff) { - mbmi->mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP)); - cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)] - [mbmi->mb_skip_coeff]++; - } - - // Read the reference frame - if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) { - mbmi->ref_frame[0] = - vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER)); - cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)] - [mbmi->ref_frame[0] != INTRA_FRAME]++; - } else { - mbmi->ref_frame[0] = - vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; - } - - if (cm->txfm_mode == TX_MODE_SELECT && - (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) && - bsize >= BLOCK_SIZE_SB8X8) { - mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize); - } else if (bsize >= BLOCK_SIZE_SB32X32 && - cm->txfm_mode >= ALLOW_32X32) { - mbmi->txfm_size = TX_32X32; - } else if (cm->txfm_mode >= ALLOW_16X16 && - bsize >= BLOCK_SIZE_MB16X16) { - mbmi->txfm_size = TX_16X16; - } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) { - mbmi->txfm_size = TX_8X8; - } else { - mbmi->txfm_size = TX_4X4; - } - - // If reference frame is an Inter frame if (mbmi->ref_frame[0] != INTRA_FRAME) { int_mv nearest, nearby, best_mv; int_mv nearest_second, nearby_second, best_mv_second; vp9_prob *mv_ref_p; + MV_REFERENCE_FRAME ref0, ref1; read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame); + ref0 = mbmi->ref_frame[0]; + ref1 = mbmi->ref_frame[1]; - { -#ifdef DEC_DEBUG - if (dec_debug) - printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row, - xd->mode_info_context->mbmi.mv[0].as_mv.col); -#endif - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]], - cm->ref_frame_sign_bias); - - mv_ref_p = cm->fc.inter_mode_probs[ - mbmi->mb_mode_context[mbmi->ref_frame[0]]]; - - // If the segment level skip mode enabled - if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) { - mbmi->mode = ZEROMV; - } else if (bsize >= BLOCK_SIZE_SB8X8) { - mbmi->mode = read_sb_mv_ref(r, mv_ref_p); - vp9_accum_mv_refs(cm, mbmi->mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); - } + vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, + ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias); - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, - mbmi->ref_mvs[mbmi->ref_frame[0]], - &nearest, &nearby); + mv_ref_p = cm->fc.inter_mode_probs[mbmi->mb_mode_context[ref0]]; - best_mv.as_int = mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_int; - } + if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + mbmi->mode = ZEROMV; + } else if (bsize >= BLOCK_SIZE_SB8X8) { + mbmi->mode = read_inter_mode(r, mv_ref_p); + vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref0]); + } + mbmi->uv_mode = DC_PRED; -#ifdef 
DEC_DEBUG - if (dec_debug) - printf("[D %d %d] %d %d %d %d\n", ref_frame, - mbmi->mb_mode_context[ref_frame], - mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]); -#endif + // nearest, nearby + if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); + best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; } mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE ? read_switchable_filter_type(pbi, r) : cm->mcomp_filter_type; - if (mbmi->ref_frame[1] > INTRA_FRAME) { + if (ref1 > INTRA_FRAME) { vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - mbmi->ref_frame[1], - mbmi->ref_mvs[mbmi->ref_frame[1]], - cm->ref_frame_sign_bias); + ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias); if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, - mbmi->ref_mvs[mbmi->ref_frame[1]], - &nearest_second, - &nearby_second); - best_mv_second.as_int = mbmi->ref_mvs[mbmi->ref_frame[1]][0].as_int; + vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], + &nearest_second, &nearby_second); + best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; } } - mbmi->uv_mode = DC_PRED; + if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { int_mv blockmv, secondmv; - int blockmode; - int i; - j = idy * 2 + idx; + const int j = idy * 2 + idx; + const int blockmode = read_inter_mode(r, mv_ref_p); - blockmode = read_sb_mv_ref(r, mv_ref_p); - vp9_accum_mv_refs(cm, blockmode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + vp9_accum_mv_refs(cm, blockmode, mbmi->mb_mode_context[ref0]); if (blockmode == NEARESTMV || blockmode == NEARMV) { - MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0); - if (rf2 > 0) { + if (ref1 > 0) vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, &nearby_second, j, 1); - } } switch (blockmode) { case NEWMV: - decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, - &cm->fc.NMVcount, xd->allow_high_precision_mv); + read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, + &cm->counts.mv, xd->allow_high_precision_mv); - if (mbmi->ref_frame[1] > 0) - decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, - &cm->fc.NMVcount, xd->allow_high_precision_mv); - -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][3]++; -#endif + if (ref1 > 0) + read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, + &cm->counts.mv, xd->allow_high_precision_mv); break; case NEARESTMV: blockmv.as_int = nearest.as_int; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) secondmv.as_int = nearest_second.as_int; -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][0]++; -#endif break; case NEARMV: blockmv.as_int = nearby.as_int; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) secondmv.as_int = nearby_second.as_int; -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][1]++; -#endif break; case ZEROMV: blockmv.as_int = 0; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) secondmv.as_int = 0; -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][2]++; -#endif break; default: - break; + assert(!"Invalid inter mode value"); } mi->bmi[j].as_mv[0].as_int = blockmv.as_int; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) mi->bmi[j].as_mv[1].as_int = secondmv.as_int; - for (i = 1; i < bh; ++i) - vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j])); - for (i = 1; i < bw; ++i) - vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j])); + if (bh == 2) + mi->bmi[j + 2] = mi->bmi[j]; + if (bw == 2) + mi->bmi[j + 1] = mi->bmi[j]; 
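
[Aside, not part of the patch] The hunk above replaces the per-element vpx_memcpy loops with two guarded struct copies when a decoded sub-8x8 partition spans more than one 4x4 unit. A self-contained sketch of that indexing, with invented types and values:

#include <stdio.h>

typedef struct { int row, col; } MvSketch;   /* stand-in for the real b_mode_info */

/* The four 4x4 units inside an 8x8 block are indexed j = idy * 2 + idx.
 * A partition two units tall (bh == 2) also owns the unit below it;
 * one two units wide (bw == 2) also owns the unit to its right. */
static void replicate(MvSketch bmi[4], int j, int bw, int bh) {
  if (bh == 2)
    bmi[j + 2] = bmi[j];
  if (bw == 2)
    bmi[j + 1] = bmi[j];
}

int main(void) {
  MvSketch bmi[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
  const int bw = 2, bh = 1;                  /* an 8x4 partition */
  int idy, idx, j;

  for (idy = 0; idy < 2; idy += bh) {
    for (idx = 0; idx < 2; idx += bw) {
      j = idy * 2 + idx;
      bmi[j].row = 8 * j;                    /* pretend this was just decoded */
      bmi[j].col = -8 * j;
      replicate(bmi, j, bw, bh);
    }
  }

  for (j = 0; j < 4; j++)
    printf("4x4 unit %d: mv = (%d, %d)\n", j, bmi[j].row, bmi[j].col);
  return 0;
}

Running it shows units 0/1 sharing one motion vector and units 2/3 sharing the other, which is exactly the effect of the two assignments in the patch.
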
mi->mbmi.mode = blockmode; } } @@ -697,6 +558,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mv0->as_int = mi->bmi[3].as_mv[0].as_int; mv1->as_int = mi->bmi[3].as_mv[1].as_int; } else { + const int mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + const int mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; + const int mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + const int mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + switch (mbmi->mode) { case NEARMV: // Clip "next_nearest" so that it does not extend to far out of image @@ -704,7 +570,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge); - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, @@ -717,7 +583,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge); - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, @@ -726,98 +592,109 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, case ZEROMV: mv0->as_int = 0; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) mv1->as_int = 0; break; case NEWMV: - decode_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount, - xd->allow_high_precision_mv); - if (mbmi->ref_frame[1] > 0) - decode_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, - &cm->fc.NMVcount, xd->allow_high_precision_mv); + read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, + xd->allow_high_precision_mv); + if (ref1 > 0) + read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, + &cm->counts.mv, xd->allow_high_precision_mv); break; default: -#if CONFIG_DEBUG - assert(0); -#endif - break; + assert(!"Invalid inter mode value"); } } } else { - // required for left and above block mv - mv0->as_int = 0; - - if (bsize >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - const int bsl = MIN(bwl, bhl); - mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]); - cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++; - } else { - int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - int ib = idy * 2 + idx, k; - int m = read_intra_mode(r, cm->fc.y_mode_prob[0]); - mi->bmi[ib].as_mode.first = m; - cm->fc.y_mode_counts[0][m]++; - for (k = 1; k < bh; ++k) - mi->bmi[ib + k * 2].as_mode.first = m; - for (k = 1; k < bw; ++k) - mi->bmi[ib + k].as_mode.first = m; - } - } - mbmi->mode = mi->bmi[3].as_mode.first; + mv0->as_int = 0; // required for left and above block mv + read_intra_block_modes(pbi, mi, r); + } +} + +static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { + int i; + + cm->comp_pred_mode = cm->allow_comp_inter_inter ? 
read_comp_pred_mode(r) + : SINGLE_PREDICTION_ONLY; + + if (cm->comp_pred_mode == HYBRID_PREDICTION) + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); + + if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) + for (i = 0; i < REF_CONTEXTS; i++) { + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); } - mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); - cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++; - } + if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) + for (i = 0; i < REF_CONTEXTS; i++) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); } -void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) { - VP9_COMMON *cm = &pbi->common; +void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; int k; // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove. // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs)); - for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.mbskip_probs[k] = - vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]); - } - // cm->fc.mbskip_probs[k] = vp9_read_prob(r); - } + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]); + + if (cm->frame_type != KEY_FRAME && !cm->intra_only) { + nmv_context *const nmvc = &pbi->common.fc.nmvc; + MACROBLOCKD *const xd = &pbi->mb; + int i, j; + + read_inter_mode_probs(&cm->fc, r); - mb_mode_mv_init(pbi, r); + if (cm->mcomp_filter_type == SWITCHABLE) + read_switchable_interp_probs(&cm->fc, r); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]); + + read_comp_pred(cm, r); + + for (j = 0; j < BLOCK_SIZE_GROUPS; j++) + for (i = 0; i < VP9_INTRA_MODES - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]); + + for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) + for (i = 0; i < PARTITION_TYPES - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]); + + read_mv_probs(r, nmvc, xd->allow_high_precision_mv); + } } -void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi, - MACROBLOCKD* const xd, - int mi_row, - int mi_col, - vp9_reader *r) { +void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); + const int y_mis = MIN(bh, cm->mi_rows - mi_row); + const int x_mis = MIN(bw, cm->mi_cols - mi_col); + int x, y; - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { - kfread_modes(pbi, mi, mi_row, mi_col, r); - } else { - read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r); - } + if (cm->frame_type == KEY_FRAME || cm->intra_only) + read_intra_mode_info(pbi, mi, mi_row, mi_col, r); + else + read_inter_mode_info(pbi, mi, mi_row, mi_col, r); - if (1) { - const int bw = 1 << mi_width_log2(mbmi->sb_type); - const int bh = 1 << 
mi_height_log2(mbmi->sb_type); - const int y_mis = MIN(bh, cm->mi_rows - mi_row); - const int x_mis = MIN(bw, cm->mi_cols - mi_col); - const int mis = cm->mode_info_stride; - int x, y; - - for (y = 0; y < y_mis; y++) - for (x = !y; x < x_mis; x++) - mi[y * mis + x] = *mi; - } + for (y = 0; y < y_mis; y++) + for (x = !y; x < x_mis; x++) + mi[y * cm->mode_info_stride + x] = *mi; } diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h index bf5e83c..4073d9e 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libvpx/vp9/decoder/vp9_decodemv.h @@ -13,11 +13,8 @@ #include "vp9/decoder/vp9_onyxd_int.h" -void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi, - MACROBLOCKD* const xd, - int mb_row, - int mb_col, - vp9_reader *r); -void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r); +void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r); + +void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r); #endif // VP9_DECODER_VP9_DECODEMV_H_ diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index 49b181d..ffec8ea 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -14,15 +14,15 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_extend.h" -#include "vp9/common/vp9_modecont.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_reconintra.h" -#include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_entropy.h" -#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_extend.h" +#include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" @@ -30,169 +30,58 @@ #include "vp9/decoder/vp9_decodframe.h" #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_decodemv.h" +#include "vp9/decoder/vp9_dsubexp.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_read_bit_buffer.h" - -// #define DEC_DEBUG -#ifdef DEC_DEBUG -int dec_debug = 0; -#endif - static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } // len == 0 is not allowed -static int read_is_valid(const uint8_t *start, size_t len, - const uint8_t *end) { +static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return start + len > start && start + len <= end; } -static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) { - if (lossless) { - pc->txfm_mode = ONLY_4X4; - } else { - pc->txfm_mode = vp9_read_literal(r, 2); - if (pc->txfm_mode == ALLOW_32X32) - pc->txfm_mode += vp9_read_bit(r); - if (pc->txfm_mode == TX_MODE_SELECT) { - int i, j; - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - pc->fc.tx_probs_8x8p[i][j] = - vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - pc->fc.tx_probs_16x16p[i][j] = - vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - pc->fc.tx_probs_32x32p[i][j] = - vp9_read_prob_diff_update(r, 
pc->fc.tx_probs_32x32p[i][j]); - } - } - } - } -} - -static int get_unsigned_bits(unsigned int num_values) { - int cat = 0; - if (num_values <= 1) - return 0; - num_values--; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; -} - -static int inv_recenter_nonneg(int v, int m) { - if (v > 2 * m) - return v; - - return v % 2 ? m - (v + 1) / 2 : m + v / 2; -} - -static int decode_uniform(vp9_reader *r, int n) { - int v; - const int l = get_unsigned_bits(n); - const int m = (1 << l) - n; - if (!l) - return 0; - - v = vp9_read_literal(r, l - 1); - return v < m ? v : (v << 1) - m + vp9_read_bit(r); -} - -static int decode_term_subexp(vp9_reader *r, int k, int num_syms) { - int i = 0, mk = 0, word; - while (1) { - const int b = i ? k + i - 1 : k; - const int a = 1 << b; - if (num_syms <= mk + 3 * a) { - word = decode_uniform(r, num_syms - mk) + mk; - break; - } else { - if (vp9_read_bit(r)) { - i++; - mk += a; - } else { - word = vp9_read_literal(r, b) + mk; - break; - } - } - } - return word; -} - static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) { const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max)); return data > max ? max : data; } -static int merge_index(int v, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (v < max1) { - v = v * modulus + modulus / 2; - } else { - int w; - v -= max1; - w = v; - v += (v + modulus - modulus / 2) / modulus; - while (v % modulus == modulus / 2 || - w != v - (v + modulus - modulus / 2) / modulus) v++; - } - return v; -} - -static int inv_remap_prob(int v, int m) { - const int n = 255; - - v = merge_index(v, n - 1, MODULUS_PARAM); - m--; - if ((m << 1) <= n) { - return 1 + inv_recenter_nonneg(v + 1, m); - } else { - return n - inv_recenter_nonneg(v + 1, n - 1 - m); - } +static TX_MODE read_tx_mode(vp9_reader *r) { + TX_MODE tx_mode = vp9_read_literal(r, 2); + if (tx_mode == ALLOW_32X32) + tx_mode += vp9_read_bit(r); + return tx_mode; } -vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) { - int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); - return (vp9_prob)inv_remap_prob(delp, oldp); -} +static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { + int i, j; -void vp9_init_dequantizer(VP9_COMMON *pc) { - int q; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); - for (q = 0; q < QINDEX_RANGE; q++) { - // DC value - pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q); - pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q); + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); - // AC values - pc->y_dequant[q][1] = vp9_ac_quant(q, 0); - pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q); - } + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); } -static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) { +static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) { int i; const int segment_id = xd->mode_info_context->mbmi.segment_id; - xd->q_index = vp9_get_qindex(xd, segment_id, pc->base_qindex); + xd->q_index = vp9_get_qindex(xd, segment_id, cm->base_qindex); - xd->plane[0].dequant = pc->y_dequant[xd->q_index]; + xd->plane[0].dequant = cm->y_dequant[xd->q_index]; 
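
[Aside, not part of the patch] init_dequantizer here only selects rows from tables that vp9_init_dequantizer (added later in this file) fills once per frame. A rough sketch of that precompute-then-index pattern, using placeholder step-size formulas instead of the real vp9_dc_quant/vp9_ac_quant lookups:

#include <stdint.h>
#include <stdio.h>

#define QINDEX_RANGE 256

/* Placeholder formulas; the real tables come from vp9_dc_quant()/vp9_ac_quant(). */
static int16_t fake_dc_quant(int q) { return (int16_t)(4 + 2 * q); }
static int16_t fake_ac_quant(int q) { return (int16_t)(4 + 3 * q); }

static int16_t y_dequant[QINDEX_RANGE][2];   /* [q][0] = DC step, [q][1] = AC step */

int main(void) {
  int q;

  /* Filled once per frame ... */
  for (q = 0; q < QINDEX_RANGE; q++) {
    y_dequant[q][0] = fake_dc_quant(q);
    y_dequant[q][1] = fake_ac_quant(q);
  }

  /* ... then each block just points at the row for its q_index,
   * which may differ per segment. */
  {
    const int q_index = 63;
    const int16_t *dequant = y_dequant[q_index];
    printf("q_index %d: DC step %d, AC step %d\n", q_index, dequant[0], dequant[1]);
  }
  return 0;
}
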
for (i = 1; i < MAX_MB_PLANE; i++) - xd->plane[i].dequant = pc->uv_dequant[xd->q_index]; + xd->plane[i].dequant = cm->uv_dequant[xd->q_index]; } static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -201,32 +90,32 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, struct macroblockd_plane *pd = &xd->plane[plane]; int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); const int stride = pd->dst.stride; + const int eob = pd->eobs[block]; const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, block, ss_txfrm_size); uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, pd->dst.buf, stride); - TX_TYPE tx_type; - switch (ss_txfrm_size / 2) { - case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; + case TX_4X4: { + const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); if (tx_type == DCT_DCT) - xd->itxm_add(qcoeff, dst, stride, pd->eobs[block]); + xd->itxm_add(qcoeff, dst, stride, eob); else - vp9_iht_add_c(tx_type, qcoeff, dst, stride, pd->eobs[block]); + vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob); break; + } case TX_8X8: - tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; - vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride, pd->eobs[block]); + vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst, + stride, eob); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride, pd->eobs[block]); + vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst, + stride, eob); break; case TX_32X32: - vp9_idct_add_32x32(qcoeff, dst, stride, pd->eobs[block]); + vp9_idct_add_32x32(qcoeff, dst, stride, eob); break; } } @@ -235,6 +124,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { MACROBLOCKD* const xd = arg; struct macroblockd_plane *pd = &xd->plane[plane]; + MODE_INFO *const mi = xd->mode_info_context; const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, block, ss_txfrm_size); @@ -245,13 +135,12 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, int b_mode; int plane_b_size; const int tx_ib = raster_block >> tx_size; - const int mode = plane == 0 ? xd->mode_info_context->mbmi.mode - : xd->mode_info_context->mbmi.uv_mode; + const int mode = plane == 0 ? 
mi->mbmi.mode + : mi->mbmi.uv_mode; - - if (plane == 0 && xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + if (plane == 0 && mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) { assert(bsize == BLOCK_SIZE_SB8X8); - b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first; + b_mode = mi->bmi[raster_block].as_mode; } else { b_mode = mode; } @@ -261,97 +150,28 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, plane_b_size = b_width_log2(bsize) - pd->subsampling_x; vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, + dst, pd->dst.stride, dst, pd->dst.stride); // Early exit if there are no coefficients - if (xd->mode_info_context->mbmi.mb_skip_coeff) + if (mi->mbmi.mb_skip_coeff) return; decode_block(plane, block, bsize, ss_txfrm_size, arg); } -static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd, - int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - - assert(mbmi->ref_frame[0] != INTRA_FRAME); - - if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only)) - vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common); - - // prediction - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - - if (mbmi->mb_skip_coeff) { - vp9_reset_sb_tokens_context(xd, bsize); - } else { - if (xd->segmentation_enabled) - mb_init_dequantizer(&pbi->common, xd); - - if (!vp9_reader_has_error(r)) - vp9_decode_tokens(pbi, r, bsize); - - foreach_transformed_block(xd, bsize, decode_block, xd); - } -} +static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) { + MACROBLOCKD *const xd = &pbi->mb; -static void decode_sb_intra(VP9D_COMP *pbi, MACROBLOCKD *xd, - int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - if (mbmi->mb_skip_coeff) { + if (xd->mode_info_context->mbmi.mb_skip_coeff) { vp9_reset_sb_tokens_context(xd, bsize); + return -1; } else { - if (xd->segmentation_enabled) - mb_init_dequantizer(&pbi->common, xd); - - if (!vp9_reader_has_error(r)) - vp9_decode_tokens(pbi, r, bsize); - } + if (xd->seg.enabled) + init_dequantizer(&pbi->common, xd); - foreach_transformed_block(xd, bsize, decode_block_intra, xd); -} - - -static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize); - const int bw = 1 << bwl, bh = 1 << bhl; - int n, eobtotal; - VP9_COMMON *const pc = &pbi->common; - MODE_INFO *const mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; - const int mis = pc->mode_info_stride; - - assert(mbmi->sb_type == bsize); - assert(mbmi->ref_frame[0] != INTRA_FRAME); - - if (pbi->common.frame_type != KEY_FRAME) - vp9_setup_interp_filters(xd, mbmi->interp_filter, pc); - - // generate prediction - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - - if (mbmi->mb_skip_coeff) { - vp9_reset_sb_tokens_context(xd, bsize); - } else { - // re-initialize macroblock dequantizer before detokenization - if (xd->segmentation_enabled) - mb_init_dequantizer(pc, xd); - - // dequantization and idct - eobtotal = vp9_decode_tokens(pbi, r, bsize); - if (eobtotal == 0) { // skip loopfilter - for (n = 0; n < bw * bh; n++) { - const int x_idx = n & (bw - 1), y_idx = n >> bwl; - - if (mi_col + x_idx < pc->mi_cols && mi_row + y_idx < pc->mi_rows) - mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = 1; - } - } else { - foreach_transformed_block(xd, bsize, 
decode_block, xd); - } + // TODO(dkovalev) if (!vp9_reader_has_error(r)) + return vp9_decode_tokens(pbi, r, bsize); } } @@ -377,8 +197,8 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, pd->left_context = cm->left_context[i] + (((mi_row * 2) & 15) >> pd->subsampling_y); } - xd->above_seg_context = cm->above_seg_context + mi_col; - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); + + set_partition_seg_context(cm, xd, mi_row, mi_col); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units @@ -387,53 +207,65 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col); } -static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) { +static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + const int ref = mbmi->ref_frame[i] - 1; - if (mbmi->ref_frame[0] > INTRA_FRAME) { - // Select the appropriate reference frame for this MB - const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1]; - const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx]; - xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; - xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; - setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, - xd->scale_factor, xd->scale_factor_uv); - xd->corrupted |= cfg->corrupted; - - if (mbmi->ref_frame[1] > INTRA_FRAME) { - // Select the appropriate reference frame for this MB - const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1]; - const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx]; - xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; - xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; - setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, - xd->scale_factor, xd->scale_factor_uv); - xd->corrupted |= second_cfg->corrupted; - } - } + const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; + xd->scale_factor[i] = cm->active_ref_scale[ref]; + setup_pre_planes(xd, i, cfg, mi_row, mi_col, &xd->scale_factor[i]); + xd->corrupted |= cfg->corrupted; } static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + const int less8x8 = bsize < BLOCK_SIZE_SB8X8; + MB_MODE_INFO *mbmi; - if (bsize < BLOCK_SIZE_SB8X8) + if (less8x8) if (xd->ab_index > 0) return; + set_offsets(pbi, bsize, mi_row, mi_col); - vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r); - set_refs(pbi, mi_row, mi_col); + vp9_read_mode_info(pbi, mi_row, mi_col, r); - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ? 
- BLOCK_SIZE_SB8X8 : bsize); - else if (bsize < BLOCK_SIZE_SB8X8) - decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8); - else - decode_sb(pbi, xd, mi_row, mi_col, r, bsize); + if (less8x8) + bsize = BLOCK_SIZE_SB8X8; + // Has to be called after set_offsets + mbmi = &xd->mode_info_context->mbmi; + + if (mbmi->ref_frame[0] == INTRA_FRAME) { + // Intra reconstruction + decode_tokens(pbi, bsize, r); + foreach_transformed_block(xd, bsize, decode_block_intra, xd); + } else { + // Inter reconstruction + int eobtotal; + + set_ref(pbi, 0, mi_row, mi_col); + if (mbmi->ref_frame[1] > INTRA_FRAME) + set_ref(pbi, 1, mi_row, mi_col); + + vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + eobtotal = decode_tokens(pbi, bsize, r); + if (less8x8) { + if (eobtotal >= 0) + foreach_transformed_block(xd, bsize, decode_block, xd); + } else { + assert(mbmi->sb_type == bsize); + if (eobtotal == 0) + // skip loopfilter + vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, 1); + else if (eobtotal > 0) + foreach_transformed_block(xd, bsize, decode_block, xd); + } + } xd->corrupted |= vp9_reader_has_error(r); } @@ -448,16 +280,13 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols) return; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_SIZE_SB8X8) { if (xd->ab_index != 0) return; - - if (bsize >= BLOCK_SIZE_SB8X8) { + } else { int pl; - int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize); - // read the partition information - xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = pc->above_seg_context + mi_col; + const int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize); + set_partition_seg_context(pc, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); if (idx == 0) @@ -469,7 +298,7 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, else partition = PARTITION_SPLIT; - pc->fc.partition_counts[pl][partition]++; + pc->counts.partition[pl][partition]++; } subsize = get_subsize(bsize, partition); @@ -499,8 +328,9 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, } break; default: - assert(0); + assert(!"Invalid partition type"); } + // update partition context if (bsize >= BLOCK_SIZE_SB8X8 && (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { @@ -527,142 +357,118 @@ static void setup_token_decoder(VP9D_COMP *pbi, "Failed to allocate bool decoder %d", 1); } -static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size, +static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, vp9_reader *r) { - vp9_coeff_probs_model *coef_probs = fc->coef_probs[tx_size]; - - if (vp9_read_bit(r)) { - int i, j, k, l, m; - for (i = 0; i < BLOCK_TYPES; i++) { - for (j = 0; j < REF_TYPES; j++) { - for (k = 0; k < COEF_BANDS; k++) { - for (l = 0; l < PREV_COEF_CONTEXTS; l++) { - if (l >= 3 && k == 0) - continue; - - for (m = 0; m < UNCONSTRAINED_NODES; m++) { - vp9_prob *const p = coef_probs[i][j][k][l] + m; - - if (vp9_read(r, VP9_COEF_UPDATE_PROB)) - *p = vp9_read_prob_diff_update(r, *p); - } - } - } - } - } - } -} + int i, j, k, l, m; -static void read_coef_probs(VP9D_COMP *pbi, vp9_reader *r) { - const TXFM_MODE txfm_mode = pbi->common.txfm_mode; - FRAME_CONTEXT *const fc = &pbi->common.fc; + if (vp9_read_bit(r)) + for (i = 0; i < BLOCK_TYPES; i++) + for (j = 0; j < REF_TYPES; j++) + for (k = 0; k < COEF_BANDS; k++) + for (l = 0; l < PREV_COEF_CONTEXTS; 
l++) + if (k > 0 || l < 3) + for (m = 0; m < UNCONSTRAINED_NODES; m++) + if (vp9_read(r, VP9_COEF_UPDATE_PROB)) + vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); +} - read_coef_probs_common(fc, TX_4X4, r); +static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, + vp9_reader *r) { + read_coef_probs_common(fc->coef_probs[TX_4X4], r); - if (txfm_mode > ONLY_4X4) - read_coef_probs_common(fc, TX_8X8, r); + if (tx_mode > ONLY_4X4) + read_coef_probs_common(fc->coef_probs[TX_8X8], r); - if (txfm_mode > ALLOW_8X8) - read_coef_probs_common(fc, TX_16X16, r); + if (tx_mode > ALLOW_8X8) + read_coef_probs_common(fc->coef_probs[TX_16X16], r); - if (txfm_mode > ALLOW_16X16) - read_coef_probs_common(fc, TX_32X32, r); + if (tx_mode > ALLOW_16X16) + read_coef_probs_common(fc->coef_probs[TX_32X32], r); } -static void setup_segmentation(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { +static void setup_segmentation(struct segmentation *seg, + struct vp9_read_bit_buffer *rb) { int i, j; - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + seg->update_map = 0; + seg->update_data = 0; - xd->segmentation_enabled = vp9_rb_read_bit(rb); - if (!xd->segmentation_enabled) + seg->enabled = vp9_rb_read_bit(rb); + if (!seg->enabled) return; // Segmentation map update - xd->update_mb_segmentation_map = vp9_rb_read_bit(rb); - if (xd->update_mb_segmentation_map) { - for (i = 0; i < MB_SEG_TREE_PROBS; i++) - xd->mb_segment_tree_probs[i] = vp9_rb_read_bit(rb) ? - vp9_rb_read_literal(rb, 8) : MAX_PROB; - - cm->temporal_update = vp9_rb_read_bit(rb); - if (cm->temporal_update) { + seg->update_map = vp9_rb_read_bit(rb); + if (seg->update_map) { + for (i = 0; i < SEG_TREE_PROBS; i++) + seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) + : MAX_PROB; + + seg->temporal_update = vp9_rb_read_bit(rb); + if (seg->temporal_update) { for (i = 0; i < PREDICTION_PROBS; i++) - cm->segment_pred_probs[i] = vp9_rb_read_bit(rb) ? - vp9_rb_read_literal(rb, 8) : MAX_PROB; + seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) + : MAX_PROB; } else { for (i = 0; i < PREDICTION_PROBS; i++) - cm->segment_pred_probs[i] = MAX_PROB; + seg->pred_probs[i] = MAX_PROB; } } // Segmentation data update - xd->update_mb_segmentation_data = vp9_rb_read_bit(rb); - if (xd->update_mb_segmentation_data) { - xd->mb_segment_abs_delta = vp9_rb_read_bit(rb); + seg->update_data = vp9_rb_read_bit(rb); + if (seg->update_data) { + seg->abs_delta = vp9_rb_read_bit(rb); - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(seg); - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { int data = 0; const int feature_enabled = vp9_rb_read_bit(rb); if (feature_enabled) { - vp9_enable_segfeature(xd, i, j); + vp9_enable_segfeature(seg, i, j); data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j)); if (vp9_is_segfeature_signed(j)) data = vp9_rb_read_bit(rb) ? 
-data : data; } - vp9_set_segdata(xd, i, j, data); + vp9_set_segdata(seg, i, j, data); } } } } -static void setup_loopfilter(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void setup_loopfilter(struct loopfilter *lf, + struct vp9_read_bit_buffer *rb) { - cm->filter_level = vp9_rb_read_literal(rb, 6); - cm->sharpness_level = vp9_rb_read_literal(rb, 3); + lf->filter_level = vp9_rb_read_literal(rb, 6); + lf->sharpness_level = vp9_rb_read_literal(rb, 3); // Read in loop filter deltas applied at the MB level based on mode or ref // frame. - xd->mode_ref_lf_delta_update = 0; + lf->mode_ref_delta_update = 0; - xd->mode_ref_lf_delta_enabled = vp9_rb_read_bit(rb); - if (xd->mode_ref_lf_delta_enabled) { - xd->mode_ref_lf_delta_update = vp9_rb_read_bit(rb); - if (xd->mode_ref_lf_delta_update) { + lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = vp9_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { int i; - for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - if (vp9_rb_read_bit(rb)) { - const int value = vp9_rb_read_literal(rb, 6); - xd->ref_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value; - } - } + for (i = 0; i < MAX_REF_LF_DELTAS; i++) + if (vp9_rb_read_bit(rb)) + lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6); - for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - if (vp9_rb_read_bit(rb)) { - const int value = vp9_rb_read_literal(rb, 6); - xd->mode_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value; - } - } + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (vp9_rb_read_bit(rb)) + lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6); } } } static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { const int old = *delta_q; - if (vp9_rb_read_bit(rb)) { - const int value = vp9_rb_read_literal(rb, 4); - *delta_q = vp9_rb_read_bit(rb) ? -value : value; - } + if (vp9_rb_read_bit(rb)) + *delta_q = vp9_rb_read_signed_literal(rb, 4); return old != *delta_q; } @@ -682,11 +488,9 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - if (xd->lossless) { - xd->itxm_add = vp9_idct_add_lossless_c; - } else { - xd->itxm_add = vp9_idct_add; - } + + xd->itxm_add = xd->lossless ? 
vp9_idct_add_lossless_c + : vp9_idct_add; } static INTERPOLATIONFILTERTYPE read_interp_filter_type( @@ -778,108 +582,90 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, apply_frame_size(pbi, width, height); } -static void update_frame_context(FRAME_CONTEXT *fc) { - vp9_copy(fc->pre_coef_probs, fc->coef_probs); - vp9_copy(fc->pre_y_mode_prob, fc->y_mode_prob); - vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob); - vp9_copy(fc->pre_partition_prob, fc->partition_prob[1]); - vp9_copy(fc->pre_intra_inter_prob, fc->intra_inter_prob); - vp9_copy(fc->pre_comp_inter_prob, fc->comp_inter_prob); - vp9_copy(fc->pre_single_ref_prob, fc->single_ref_prob); - vp9_copy(fc->pre_comp_ref_prob, fc->comp_ref_prob); - fc->pre_nmvc = fc->nmvc; - vp9_copy(fc->pre_switchable_interp_prob, fc->switchable_interp_prob); - vp9_copy(fc->pre_inter_mode_probs, fc->inter_mode_probs); - vp9_copy(fc->pre_tx_probs_8x8p, fc->tx_probs_8x8p); - vp9_copy(fc->pre_tx_probs_16x16p, fc->tx_probs_16x16p); - vp9_copy(fc->pre_tx_probs_32x32p, fc->tx_probs_32x32p); - vp9_copy(fc->pre_mbskip_probs, fc->mbskip_probs); - - vp9_zero(fc->coef_counts); - vp9_zero(fc->eob_branch_counts); - vp9_zero(fc->y_mode_counts); - vp9_zero(fc->uv_mode_counts); - vp9_zero(fc->NMVcount); - vp9_zero(fc->inter_mode_counts); - vp9_zero(fc->partition_counts); - vp9_zero(fc->switchable_interp_count); - vp9_zero(fc->intra_inter_count); - vp9_zero(fc->comp_inter_count); - vp9_zero(fc->single_ref_count); - vp9_zero(fc->comp_ref_count); - vp9_zero(fc->tx_count_8x8p); - vp9_zero(fc->tx_count_16x16p); - vp9_zero(fc->tx_count_32x32p); - vp9_zero(fc->mbskip_count); -} - static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { VP9_COMMON *const pc = &pbi->common; int mi_row, mi_col; - for (mi_row = pc->cur_tile_mi_row_start; - mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) { + if (pbi->do_loopfilter_inline) { + vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level); + } + + for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end; + mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within vpx_memset(&pc->left_context, 0, sizeof(pc->left_context)); vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); - for (mi_col = pc->cur_tile_mi_col_start; - mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) + for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE) { decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64); + } + + if (pbi->do_loopfilter_inline) { + YV12_BUFFER_CONFIG *const fb = + &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + // delay the loopfilter by 1 macroblock row. 
+ const int lf_start = mi_row - MI_BLOCK_SIZE; + if (lf_start < 0) continue; + vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0); + } + } + + if (pbi->do_loopfilter_inline) { + YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + vp9_loop_filter_rows(fb, pc, &pbi->mb, + mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0); } } static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - int delta_log2_tiles; + int min_log2_tile_cols, max_log2_tile_cols, max_ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - vp9_get_tile_n_bits(cm, &cm->log2_tile_columns, &delta_log2_tiles); - while (delta_log2_tiles--) { - if (vp9_rb_read_bit(rb)) { - cm->log2_tile_columns++; - } else { - break; - } - } + // columns + max_ones = max_log2_tile_cols - min_log2_tile_cols; + cm->log2_tile_cols = min_log2_tile_cols; + while (max_ones-- && vp9_rb_read_bit(rb)) + cm->log2_tile_cols++; + // rows cm->log2_tile_rows = vp9_rb_read_bit(rb); if (cm->log2_tile_rows) cm->log2_tile_rows += vp9_rb_read_bit(rb); - - cm->tile_columns = 1 << cm->log2_tile_columns; - cm->tile_rows = 1 << cm->log2_tile_rows; } -static void decode_tiles(VP9D_COMP *pbi, - const uint8_t *data, size_t first_partition_size, - vp9_reader *residual_bc) { +static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { + vp9_reader residual_bc; + VP9_COMMON *const pc = &pbi->common; - const uint8_t *data_ptr = data + first_partition_size; - const uint8_t* const data_end = pbi->source + pbi->source_sz; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols); + const int tile_cols = 1 << pc->log2_tile_cols; + const int tile_rows = 1 << pc->log2_tile_rows; int tile_row, tile_col; // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
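
[Aside, not part of the patch] When do_loopfilter_inline is set, decode_tile interleaves filtering with decoding but keeps the filter one superblock row behind, presumably because filtering a row's bottom edge needs reconstructed pixels from the row below; a final call flushes the last row. A toy model of that schedule, with decode_row/filter_rows as placeholders:

#include <stdio.h>

#define MI_BLOCK_SIZE 8   /* one 64x64 superblock row, in 8x8 mode-info units */

static void decode_row(int mi_row) {
  printf("decode rows [%2d, %2d)\n", mi_row, mi_row + MI_BLOCK_SIZE);
}
static void filter_rows(int start, int stop) {
  printf("filter rows [%2d, %2d)\n", start, stop);
}

int main(void) {
  const int mi_rows = 24;                        /* a small example frame */
  int mi_row;

  for (mi_row = 0; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
    decode_row(mi_row);
    if (mi_row - MI_BLOCK_SIZE >= 0)             /* stay one row behind the decoder */
      filter_rows(mi_row - MI_BLOCK_SIZE, mi_row);
  }
  filter_rows(mi_row - MI_BLOCK_SIZE, mi_rows);  /* flush the final row */
  return 0;
}
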
- vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 * - MAX_MB_PLANE * mi_cols_aligned_to_sb(pc)); + vpx_memset(pc->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); - vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(pc)); + vpx_memset(pc->above_seg_context, 0, + sizeof(PARTITION_CONTEXT) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { - const int n_cols = pc->tile_columns; const uint8_t *data_ptr2[4][1 << 6]; vp9_reader bc_bak = {0}; // pre-initialize the offsets, we're going to read in inverse order - data_ptr2[0][0] = data_ptr; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + data_ptr2[0][0] = data; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { if (tile_row) { - const int size = read_be32(data_ptr2[tile_row - 1][n_cols - 1]); - data_ptr2[tile_row - 1][n_cols - 1] += 4; - data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size; + const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]); + data_ptr2[tile_row - 1][tile_cols - 1] += 4; + data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size; } - for (tile_col = 1; tile_col < n_cols; tile_col++) { + for (tile_col = 1; tile_col < tile_cols; tile_col++) { const int size = read_be32(data_ptr2[tile_row][tile_col - 1]); data_ptr2[tile_row][tile_col - 1] += 4; data_ptr2[tile_row][tile_col] = @@ -887,48 +673,49 @@ static void decode_tiles(VP9D_COMP *pbi, } } - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + for (tile_row = 0; tile_row < tile_rows; tile_row++) { vp9_get_tile_row_offsets(pc, tile_row); - for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) { + for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { vp9_get_tile_col_offsets(pc, tile_col); setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], data_end - data_ptr2[tile_row][tile_col], - residual_bc); - decode_tile(pbi, residual_bc); - if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1) - bc_bak = *residual_bc; + &residual_bc); + decode_tile(pbi, &residual_bc); + if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1) + bc_bak = residual_bc; } } - *residual_bc = bc_bak; + residual_bc = bc_bak; } else { int has_more; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + for (tile_row = 0; tile_row < tile_rows; tile_row++) { vp9_get_tile_row_offsets(pc, tile_row); - for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { size_t size; vp9_get_tile_col_offsets(pc, tile_col); - has_more = tile_col < pc->tile_columns - 1 || - tile_row < pc->tile_rows - 1; + has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; if (has_more) { - if (!read_is_valid(data_ptr, 4, data_end)) + if (!read_is_valid(data, 4, data_end)) vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - size = read_be32(data_ptr); - data_ptr += 4; + size = read_be32(data); + data += 4; } else { - size = data_end - data_ptr; + size = data_end - data; } - setup_token_decoder(pbi, data_ptr, size, residual_bc); - decode_tile(pbi, residual_bc); - data_ptr += size; + setup_token_decoder(pbi, data, size, &residual_bc); + decode_tile(pbi, &residual_bc); + data += size; } } } + + return vp9_reader_find_end(&residual_bc); } static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { @@ -949,10 +736,9 @@ static void setup_inter_inter(VP9_COMMON *cm) { int i; cm->allow_comp_inter_inter = 0; - for (i = 0; i < 
ALLOWED_REFS_PER_FRAME; ++i) { - cm->allow_comp_inter_inter |= i > 0 && + for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i) + cm->allow_comp_inter_inter |= cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]; - } if (cm->allow_comp_inter_inter) { // which one is always-on in comp inter-inter? @@ -999,7 +785,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; - cm->filter_level = 0; + xd->lf.filter_level = 0; return 0; } @@ -1053,7 +839,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LG2); + const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2); cm->active_ref_idx[i] = cm->ref_frame_map[ref]; cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb); } @@ -1078,23 +864,54 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->frame_parallel_decoding_mode = 1; } - cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LG2); + cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only) vp9_setup_past_independence(cm, xd); - setup_loopfilter(pbi, rb); + setup_loopfilter(&xd->lf, rb); setup_quantization(pbi, rb); - setup_segmentation(pbi, rb); + setup_segmentation(&xd->seg, rb); setup_tile_info(cm, rb); return vp9_rb_read_literal(rb, 16); } +static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, + size_t partition_size) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + vp9_reader r; + + if (vp9_reader_init(&r, data, partition_size)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + + cm->tx_mode = xd->lossless ? 
ONLY_4X4 : read_tx_mode(&r); + if (cm->tx_mode == TX_MODE_SELECT) + read_tx_probs(&cm->fc.tx_probs, &r); + read_coef_probs(&cm->fc, cm->tx_mode, &r); + + vp9_prepare_read_mode_info(pbi, &r); + + return vp9_reader_has_error(&r); +} + +void vp9_init_dequantizer(VP9_COMMON *cm) { + int q; + + for (q = 0; q < QINDEX_RANGE; q++) { + cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q); + cm->y_dequant[q][1] = vp9_ac_quant(q, 0); + + cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q); + cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q); + } +} + int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; - vp9_reader header_bc, residual_bc; VP9_COMMON *const pc = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; @@ -1115,6 +932,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { data += vp9_rb_bytes_read(&rb); xd->corrupted = 0; new_fb->corrupted = 0; + pbi->do_loopfilter_inline = + (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pbi->mb.lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; @@ -1125,37 +944,29 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { xd->mode_info_context = pc->mi; xd->prev_mode_info_context = pc->prev_mi; - xd->frame_type = pc->frame_type; xd->mode_info_stride = pc->mode_info_stride; - if (vp9_reader_init(&header_bc, data, first_partition_size)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate bool decoder 0"); - - mb_init_dequantizer(pc, &pbi->mb); // MB level dequantizer setup + init_dequantizer(pc, &pbi->mb); if (!keyframe) vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc); pc->fc = pc->frame_contexts[pc->frame_context_idx]; - update_frame_context(&pc->fc); - - setup_txfm_mode(pc, xd->lossless, &header_bc); - - read_coef_probs(pbi, &header_bc); + vp9_zero(pc->counts); // Initialize xd pointers. Any reference should do for xd->pre, so use 0. 
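
[Aside, not part of the patch] vp9_decode_frame now separates the parsing stages cleanly: read_uncompressed_header consumes plain bits and returns the 16-bit size of the compressed header, read_compressed_header runs a bool decoder over exactly that many bytes, and decode_tiles takes over at data + first_partition_size. A schematic of that layout, with made-up offsets and no real parsing:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Pretend frame: | uncompressed header | compressed header | tile data | */
int main(void) {
  const uint8_t frame[64] = {0};
  const size_t uncompressed_header_bytes = 10;  /* known after bit parsing    */
  const size_t first_partition_size = 22;       /* 16-bit field in the header */

  const uint8_t *compressed_hdr = frame + uncompressed_header_bytes;
  const uint8_t *tile_data = compressed_hdr + first_partition_size;

  printf("bool decoder over %zu bytes at offset %td\n",
         first_partition_size, compressed_hdr - frame);
  printf("tile data starts at offset %td\n", tile_data - frame);
  return 0;
}
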
- setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL, - 0, 0, NULL, NULL); + setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0, NULL); setup_dst_planes(xd, new_fb, 0, 0); + new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); + // Create the segmentation map structure and set to 0 if (!pc->last_frame_seg_map) - CHECK_MEM_ERROR(pc->last_frame_seg_map, + CHECK_MEM_ERROR(pc, pc->last_frame_seg_map, vpx_calloc((pc->mi_rows * pc->mi_cols), 1)); - vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y); + setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y); // clear out the coeff buffer for (i = 0; i < MAX_MB_PLANE; ++i) @@ -1163,14 +974,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { set_prev_mi(pc); - vp9_decode_mode_mvs_init(pbi, &header_bc); - - decode_tiles(pbi, data, first_partition_size, &residual_bc); + *p_data_end = decode_tiles(pbi, data + first_partition_size); pc->last_width = pc->width; pc->last_height = pc->height; - new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted; + new_fb->corrupted |= xd->corrupted; if (!pbi->decoded_key_frame) { if (keyframe && !new_fb->corrupted) @@ -1180,20 +989,18 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { "A stream must start with a complete key frame"); } - // Adaptation if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { vp9_adapt_coef_probs(pc); - if ((!keyframe) && (!pc->intra_only)) { + if (!keyframe && !pc->intra_only) { vp9_adapt_mode_probs(pc); vp9_adapt_mode_context(pc); - vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv); + vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv); } } if (pc->refresh_frame_context) pc->frame_contexts[pc->frame_context_idx] = pc->fc; - *p_data_end = vp9_reader_find_end(&residual_bc); return 0; } diff --git a/libvpx/vp9/decoder/vp9_decodframe.h b/libvpx/vp9/decoder/vp9_decodframe.h index 66e951d..00b6d67 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.h +++ b/libvpx/vp9/decoder/vp9_decodframe.h @@ -17,6 +17,5 @@ struct VP9Decompressor; void vp9_init_dequantizer(struct VP9Common *pc); int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end); -vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp); #endif // VP9_DECODER_VP9_DECODFRAME_H_ diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index 3bbb212..01c1db0 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -18,14 +18,8 @@ #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_onyxd_int.h" -#if CONFIG_BALANCED_COEFTREE -#define ZERO_CONTEXT_NODE 0 -#define EOB_CONTEXT_NODE 1 -#else #define EOB_CONTEXT_NODE 0 #define ZERO_CONTEXT_NODE 1 -#endif - #define ONE_CONTEXT_NODE 2 #define LOW_VAL_CONTEXT_NODE 3 #define TWO_CONTEXT_NODE 4 @@ -91,13 +85,15 @@ DECLARE_ALIGNED(16, extern const uint8_t, val += 1 << bits_count; \ } while (0); -static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, +static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, TX_SIZE txfm_size, const int16_t *dq, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { + FRAME_CONTEXT *const fc = &cm->fc; + FRAME_COUNTS *const counts = &cm->counts; ENTROPY_CONTEXT above_ec, left_ec; - int pt, c = 0, pad, default_eob; + int pt, c = 0; int band; vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES]; vp9_prob 
coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; @@ -113,53 +109,31 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, vp9_prob *prob; vp9_coeff_count_model *coef_counts; const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME; - TX_TYPE tx_type = DCT_DCT; - const int *scan, *nb; + const int16_t *scan, *nb; uint8_t token_cache[1024]; const uint8_t * band_translate; -#if CONFIG_BALANCED_COEFTREE - int skip_eob_node = 0; -#endif - coef_probs = fc->coef_probs[txfm_size][type][ref]; - coef_counts = fc->coef_counts[txfm_size]; + coef_counts = counts->coef[txfm_size]; switch (txfm_size) { default: case TX_4X4: { - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_4x4(xd, block_idx) : DCT_DCT; - scan = get_scan_4x4(tx_type); + scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); above_ec = A[0] != 0; left_ec = L[0] != 0; - default_eob = 16; band_translate = vp9_coefband_trans_4x4; break; } case TX_8X8: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 1 + b_width_log2(sb_type); - const int x = block_idx & ((1 << sz) - 1); - const int y = block_idx - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; - scan = get_scan_8x8(tx_type); + scan = get_scan_8x8(get_tx_type_8x8(type, xd)); above_ec = (A[0] + A[1]) != 0; left_ec = (L[0] + L[1]) != 0; - default_eob = 64; band_translate = vp9_coefband_trans_8x8plus; break; } case TX_16X16: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 2 + b_width_log2(sb_type); - const int x = block_idx & ((1 << sz) - 1); - const int y = block_idx - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; - scan = get_scan_16x16(tx_type); + scan = get_scan_16x16(get_tx_type_16x16(type, xd)); above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; - default_eob = 256; band_translate = vp9_coefband_trans_8x8plus; break; } @@ -167,13 +141,12 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, scan = vp9_default_scan_32x32; above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; - default_eob = 1024; band_translate = vp9_coefband_trans_8x8plus; break; } pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); + nb = vp9_get_coef_neighbors_handle(scan); while (1) { int val; @@ -181,43 +154,26 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, if (c >= seg_eob) break; if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, - c, default_eob); + pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; -#if !CONFIG_BALANCED_COEFTREE - fc->eob_branch_counts[txfm_size][type][ref][band][pt]++; + counts->eob_branch[txfm_size][type][ref][band][pt]++; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; SKIP_START: -#endif if (c >= seg_eob) break; if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, - c, default_eob); + pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); ++c; -#if CONFIG_BALANCED_COEFTREE - skip_eob_node = 1; - continue; -#else goto SKIP_START; -#endif - } -#if CONFIG_BALANCED_COEFTREE - if (!skip_eob_node) { - 
fc->eob_branch_counts[txfm_size][type][ref][band][pt]++; - if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) - break; } - skip_eob_node = 0; -#endif // ONE_CONTEXT_NODE_0_ if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) { @@ -293,8 +249,8 @@ SKIP_START: return c; } -static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) { - return vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; +static int get_eob(struct segmentation *seg, int segment_id, int eob_max) { + return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } struct decode_block_args { @@ -315,7 +271,7 @@ static void decode_block(int plane, int block, struct macroblockd_plane* pd = &xd->plane[plane]; const int segment_id = xd->mode_info_context->mbmi.segment_id; const TX_SIZE ss_tx_size = ss_txfrm_size / 2; - const int seg_eob = get_eob(xd, segment_id, 16 << ss_txfrm_size); + const int seg_eob = get_eob(&xd->seg, segment_id, 16 << ss_txfrm_size); const int off = block >> ss_txfrm_size; const int mod = bw - ss_tx_size - pd->subsampling_x; const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size; @@ -323,7 +279,7 @@ static void decode_block(int plane, int block, ENTROPY_CONTEXT *A = pd->above_context + aoff; ENTROPY_CONTEXT *L = pd->left_context + loff; - const int eob = decode_coefs(&arg->pbi->common.fc, xd, arg->r, block, + const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block, 16), ss_tx_size, pd->dequant, A, L); diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c new file mode 100644 index 0000000..8cc64f7 --- /dev/null +++ b/libvpx/vp9/decoder/vp9_dsubexp.c @@ -0,0 +1,106 @@ +/* + Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_entropy.h" + +#include "vp9/decoder/vp9_dsubexp.h" + +static int inv_recenter_nonneg(int v, int m) { + if (v > 2 * m) + return v; + + return v % 2 ? m - (v + 1) / 2 : m + v / 2; +} + +static int decode_uniform(vp9_reader *r, int n) { + int v; + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (!l) + return 0; + + v = vp9_read_literal(r, l - 1); + return v < m ? 
v : (v << 1) - m + vp9_read_bit(r); +} + + +static int merge_index(int v, int n, int modulus) { + int max1 = (n - 1 - modulus / 2) / modulus + 1; + if (v < max1) { + v = v * modulus + modulus / 2; + } else { + int w; + v -= max1; + w = v; + v += (v + modulus - modulus / 2) / modulus; + while (v % modulus == modulus / 2 || + w != v - (v + modulus - modulus / 2) / modulus) v++; + } + return v; +} + +static int inv_remap_prob(int v, int m) { + static int inv_map_table[MAX_PROB - 1] = { + // generated by: + // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM); + 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, + 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, + 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, + 141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, + 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, + 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, + 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, + 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, + 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + + }; + // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM); + v = inv_map_table[v]; + m--; + if ((m << 1) <= MAX_PROB) { + return 1 + inv_recenter_nonneg(v + 1, m); + } else { + return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m); + } +} + +static int decode_term_subexp(vp9_reader *r, int k, int num_syms) { + int i = 0, mk = 0, word; + while (1) { + const int b = i ? k + i - 1 : k; + const int a = 1 << b; + if (num_syms <= mk + 3 * a) { + word = decode_uniform(r, num_syms - mk) + mk; + break; + } else { + if (vp9_read_bit(r)) { + i++; + mk += a; + } else { + word = vp9_read_literal(r, b) + mk; + break; + } + } + } + return word; +} + +void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) { + int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); + *p = (vp9_prob)inv_remap_prob(delp, *p); +} diff --git a/libvpx/vp9/encoder/vp9_asm_enc_offsets.c b/libvpx/vp9/decoder/vp9_dsubexp.h index 921e8f0..aeb9399 100644 --- a/libvpx/vp9/encoder/vp9_asm_enc_offsets.c +++ b/libvpx/vp9/decoder/vp9_dsubexp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -9,9 +9,11 @@ */ -#include "vpx_ports/asm_offsets.h" +#ifndef VP9_DECODER_VP9_DSUBEXP_H_ +#define VP9_DECODER_VP9_DSUBEXP_H_ -BEGIN +#include "vp9/decoder/vp9_dboolhuff.h" +void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p); -END +#endif // VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c index c52963c..0217919 100644 --- a/libvpx/vp9/decoder/vp9_idct_blk.c +++ b/libvpx/vp9/decoder/vp9_idct_blk.c @@ -66,7 +66,7 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_short_idct4x4_add(input, dest, stride); vpx_memset(input, 0, 32); } else { - vp9_dc_only_idct_add(input[0], dest, dest, stride, stride); + vp9_short_idct4x4_1_add(input, dest, stride); ((int *)input)[0] = 0; } } diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index 3cef88b..cb72920 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -136,7 +136,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { // vp9_init_dequantizer() for every frame. vp9_init_dequantizer(&pbi->common); - vp9_loop_filter_init(&pbi->common); + vp9_loop_filter_init(&pbi->common, &pbi->mb.lf); pbi->common.error.setjmp = 0; pbi->decoded_key_frame = 0; @@ -154,7 +154,6 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vpx_free(pbi->common.last_frame_seg_map); vp9_remove_common(&pbi->common); - vpx_free(pbi->mbc); vpx_free(pbi); } @@ -347,9 +346,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->current_video_frame + 1000); #endif - if (cm->filter_level) { + if (!pbi->do_loopfilter_inline) { /* Apply the loop filter if appropriate. 
*/ - vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0); + vp9_loop_filter_frame(cm, &pbi->mb, pbi->mb.lf.filter_level, 0); } #if WRITE_RECON_BUFFER == 2 @@ -361,8 +360,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->current_video_frame + 3000); #endif - vp9_extend_frame_borders(cm->frame_to_show, - cm->subsampling_x, cm->subsampling_y); + vp9_extend_frame_inner_borders(cm->frame_to_show, + cm->subsampling_x, + cm->subsampling_y); } #if WRITE_RECON_BUFFER == 1 @@ -412,9 +412,8 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; - sd->clrtype = pbi->common.clr_type; #if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&pbi->common, sd, flags); + ret = vp9_post_proc_frame(&pbi->common, &pbi->mb.lf, sd, flags); #else if (pbi->common.frame_to_show) { diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h index 8698570..4760066 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_int.h +++ b/libvpx/vp9/decoder/vp9_onyxd_int.h @@ -10,13 +10,14 @@ #ifndef VP9_DECODER_VP9_ONYXD_INT_H_ #define VP9_DECODER_VP9_ONYXD_INT_H_ + #include "./vpx_config.h" -#include "vp9/decoder/vp9_onyxd.h" -#include "vp9/decoder/vp9_treereader.h" + #include "vp9/common/vp9_onyxc_int.h" -#include "vp9/decoder/vp9_idct_blk.h" -// #define DEC_DEBUG +#include "vp9/decoder/vp9_idct_blk.h" +#include "vp9/decoder/vp9_onyxd.h" +#include "vp9/decoder/vp9_treereader.h" typedef struct VP9Decompressor { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -28,35 +29,17 @@ typedef struct VP9Decompressor { const uint8_t *source; uint32_t source_sz; - vp9_reader *mbc; int64_t last_time_stamp; int ready_for_new_data; int refresh_frame_flags; - vp9_prob prob_skip_false; int decoded_key_frame; int initial_width; int initial_height; -} VP9D_COMP; - -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval" at %s:%d", \ - __FILE__,__LINE__);\ - } while(0) -#else -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval);\ - } while(0) -#endif + int do_loopfilter_inline; // apply loopfilter to available rows immediately +} VP9D_COMP; #endif // VP9_DECODER_VP9_TREEREADER_H_ diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h index f243cb4..c7fa3aa 100644 --- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h +++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.h @@ -51,4 +51,10 @@ static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) { return value; } +static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, + int bits) { + const int value = vp9_rb_read_literal(rb, bits); + return vp9_rb_read_bit(rb) ? 
-value : value; +} + #endif // VP9_READ_BIT_BUFFER_ diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 09ab2db..ad0f6c5 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -32,6 +32,7 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_bitstream.h" #include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_subexp.h" #include "vp9/encoder/vp9_write_bit_buffer.h" @@ -48,8 +49,6 @@ vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES]; extern unsigned int active_section; #endif -#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8) -#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) #ifdef MODE_STATS int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; @@ -155,8 +154,6 @@ void write_switchable_interp_stats() { } #endif -static int update_bits[255]; - static INLINE void write_be32(uint8_t *p, int value) { p[0] = value >> 24; p[1] = value >> 16; @@ -164,248 +161,11 @@ static INLINE void write_be32(uint8_t *p, int value) { p[3] = value; } - - -int recenter_nonneg(int v, int m) { - if (v > (m << 1)) - return v; - else if (v >= m) - return ((v - m) << 1); - else - return ((m - v) << 1) - 1; -} - -static int get_unsigned_bits(unsigned num_values) { - int cat = 0; - if ((num_values--) <= 1) return 0; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; -} - void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, int data, int max) { vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); } -void encode_uniform(vp9_writer *w, int v, int n) { - int l = get_unsigned_bits(n); - int m; - if (l == 0) - return; - m = (1 << l) - n; - if (v < m) { - vp9_write_literal(w, v, l - 1); - } else { - vp9_write_literal(w, m + ((v - m) >> 1), l - 1); - vp9_write_literal(w, (v - m) & 1, 1); - } -} - -int count_uniform(int v, int n) { - int l = get_unsigned_bits(n); - int m; - if (l == 0) return 0; - m = (1 << l) - n; - if (v < m) - return l - 1; - else - return l; -} - -void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) { - int i = 0; - int mk = 0; - while (1) { - int b = (i ? k + i - 1 : k); - int a = (1 << b); - if (num_syms <= mk + 3 * a) { - encode_uniform(w, word - mk, num_syms - mk); - break; - } else { - int t = (word >= mk + a); - vp9_write_literal(w, t, 1); - if (t) { - i = i + 1; - mk += a; - } else { - vp9_write_literal(w, word - mk, b); - break; - } - } - } -} - -int count_term_subexp(int word, int k, int num_syms) { - int count = 0; - int i = 0; - int mk = 0; - while (1) { - int b = (i ? 
k + i - 1 : k); - int a = (1 << b); - if (num_syms <= mk + 3 * a) { - count += count_uniform(word - mk, num_syms - mk); - break; - } else { - int t = (word >= mk + a); - count++; - if (t) { - i = i + 1; - mk += a; - } else { - count += b; - break; - } - } - } - return count; -} - -static void compute_update_table() { - int i; - for (i = 0; i < 254; i++) - update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255); -} - -static int split_index(int i, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (i % modulus == modulus / 2) i = i / modulus; - else i = max1 + i - (i + modulus - modulus / 2) / modulus; - return i; -} - -static int remap_prob(int v, int m) { - const int n = 255; - const int modulus = MODULUS_PARAM; - int i; - v--; - m--; - if ((m << 1) <= n) - i = recenter_nonneg(v, m) - 1; - else - i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1; - - i = split_index(i, n - 1, modulus); - return i; -} - -static void write_prob_diff_update(vp9_writer *w, - vp9_prob newp, vp9_prob oldp) { - int delp = remap_prob(newp, oldp); - encode_term_subexp(w, delp, SUBEXP_PARAM, 255); -} - -static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { - int delp = remap_prob(newp, oldp); - return update_bits[delp] * 256; -} - -static int prob_update_savings(const unsigned int *ct, - const vp9_prob oldp, const vp9_prob newp, - const vp9_prob upd) { - const int old_b = cost_branch256(ct, oldp); - const int new_b = cost_branch256(ct, newp); - const int update_b = 2048 + vp9_cost_upd256; - return old_b - new_b - update_b; -} - -static int prob_diff_update_savings_search(const unsigned int *ct, - const vp9_prob oldp, vp9_prob *bestp, - const vp9_prob upd) { - const int old_b = cost_branch256(ct, oldp); - int new_b, update_b, savings, bestsavings, step; - vp9_prob newp, bestnewp; - - bestsavings = 0; - bestnewp = oldp; - - step = (*bestp > oldp ? -1 : 1); - for (newp = *bestp; newp != oldp; newp += step) { - new_b = cost_branch256(ct, newp); - update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256; - savings = old_b - new_b - update_b; - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - *bestp = bestnewp; - return bestsavings; -} - -static int prob_diff_update_savings_search_model(const unsigned int *ct, - const vp9_prob *oldp, - vp9_prob *bestp, - const vp9_prob upd, - int b, int r) { - int i, old_b, new_b, update_b, savings, bestsavings, step; - int newp; - vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; - vp9_model_to_full_probs(oldp, oldplist); - vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES); - for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) - old_b += cost_branch256(ct + 2 * i, oldplist[i]); - old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]); - - bestsavings = 0; - bestnewp = oldp[PIVOT_NODE]; - - step = (*bestp > oldp[PIVOT_NODE] ? 
-1 : 1); - newp = *bestp; - for (; newp != oldp[PIVOT_NODE]; newp += step) { - if (newp < 1 || newp > 255) continue; - newplist[PIVOT_NODE] = newp; - vp9_model_to_full_probs(newplist, newplist); - for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) - new_b += cost_branch256(ct + 2 * i, newplist[i]); - new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]); - update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + - vp9_cost_upd256; - savings = old_b - new_b - update_b; - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - *bestp = bestnewp; - return bestsavings; -} - -static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd, - unsigned int *ct) { - vp9_prob newp; - int savings; - newp = get_binary_prob(ct[0], ct[1]); - assert(newp >= 1); - savings = prob_update_savings(ct, *oldp, newp, upd); - if (savings > 0) { - vp9_write(bc, 1, upd); - vp9_write_prob(bc, newp); - *oldp = newp; - } else { - vp9_write(bc, 0, upd); - } -} - -static void vp9_cond_prob_diff_update(vp9_writer *bc, vp9_prob *oldp, - vp9_prob upd, - unsigned int *ct) { - vp9_prob newp; - int savings; - newp = get_binary_prob(ct[0], ct[1]); - assert(newp >= 1); - savings = prob_diff_update_savings_search(ct, *oldp, &newp, upd); - if (savings > 0) { - vp9_write(bc, 1, upd); - write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } else { - vp9_write(bc, 0, upd); - } -} - static void update_mode( vp9_writer *w, int n, @@ -440,16 +200,39 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, (unsigned int *)cpi->y_mode_count[j]); } -void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc) { - VP9_COMMON *const pc = &cpi->common; - int k; +static void write_selected_txfm_size(const VP9_COMP *cpi, TX_SIZE tx_size, + BLOCK_SIZE_TYPE bsize, vp9_writer *w) { + const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs); + vp9_write(w, tx_size != TX_4X4, tx_probs[0]); + if (bsize >= BLOCK_SIZE_MB16X16 && tx_size != TX_4X4) { + vp9_write(w, tx_size != TX_8X8, tx_probs[1]); + if (bsize >= BLOCK_SIZE_SB32X32 && tx_size != TX_8X8) + vp9_write(w, tx_size != TX_16X16, tx_probs[2]); + } +} - for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - vp9_cond_prob_diff_update(bc, &pc->fc.mbskip_probs[k], - VP9_MODE_UPDATE_PROB, pc->fc.mbskip_count[k]); +static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, + vp9_writer *w) { + const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip_coeff = m->mbmi.mb_skip_coeff; + vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd)); + return skip_coeff; } } +void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) { + VP9_COMMON *cm = &cpi->common; + int k; + + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], + VP9_MODE_UPDATE_PROB, cm->counts.mbskip[k]); +} + static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m); } @@ -465,7 +248,7 @@ static void update_switchable_interp_probs(VP9_COMP *const cpi, vp9_tree_probs_from_distribution( vp9_switchable_interp_tree, new_prob[j], branch_ct[j], - pc->fc.switchable_interp_count[j], 0); + pc->counts.switchable_interp[j], 0); } for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { @@ -486,7 +269,7 @@ static void 
update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) { for (j = 0; j < VP9_INTER_MODES - 1; j++) { vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j], VP9_MODE_UPDATE_PROB, - pc->fc.inter_mode_counts[i][j]); + pc->counts.inter_mode[i][j]); } } } @@ -519,22 +302,13 @@ static void pack_mb_tokens(vp9_writer* const bc, assert(pp != 0); /* skip one or two nodes */ -#if !CONFIG_BALANCED_COEFTREE if (p->skip_eob_node) { n -= p->skip_eob_node; i = 2 * p->skip_eob_node; } -#endif do { const int bb = (v >> --n) & 1; -#if CONFIG_BALANCED_COEFTREE - if (i == 2 && p->skip_eob_node) { - i += 2; - assert(bb == 1); - continue; - } -#endif vp9_write(bc, bb, pp[i >> 1]); i = vp9_coef_tree[i + bb]; } while (n); @@ -563,22 +337,18 @@ static void pack_mb_tokens(vp9_writer* const bc, *tp = p; } -static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m, +static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode, const vp9_prob *p) { -#if CONFIG_DEBUG - assert(NEARESTMV <= m && m <= NEWMV); -#endif - write_token(bc, vp9_sb_mv_ref_tree, p, - vp9_sb_mv_ref_encoding_array - NEARESTMV + m); + assert(is_inter_mode(mode)); + write_token(w, vp9_inter_mode_tree, p, + &vp9_inter_mode_encodings[mode - NEARESTMV]); } -// This function writes the current macro block's segnment id to the bitstream -// It should only be called if a segment map update is indicated. -static void write_mb_segid(vp9_writer *bc, - const MB_MODE_INFO *mi, const MACROBLOCKD *xd) { - if (xd->segmentation_enabled && xd->update_mb_segmentation_map) - treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs, - mi->segment_id, 3); + +static void write_segment_id(vp9_writer *w, const struct segmentation *seg, + int segment_id) { + if (seg->enabled && seg->update_map) + treed_write(w, vp9_segment_tree, seg->tree_probs, segment_id, 3); } // This function encodes the reference frame @@ -588,7 +358,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mi = &xd->mode_info_context->mbmi; const int segment_id = mi->segment_id; - int seg_ref_active = vp9_segfeature_active(xd, segment_id, + int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME); // If segment level coding of this signal is disabled... 
// or the segment allows multiple reference frame options @@ -597,7 +367,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { // (if not specified at the frame/segment level) if (pc->comp_pred_mode == HYBRID_PREDICTION) { vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME, - vp9_get_pred_prob(pc, xd, PRED_COMP_INTER_INTER)); + vp9_get_pred_prob_comp_inter_inter(pc, xd)); } else { assert((mi->ref_frame[1] <= INTRA_FRAME) == (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY)); @@ -605,17 +375,17 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { if (mi->ref_frame[1] > INTRA_FRAME) { vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME, - vp9_get_pred_prob(pc, xd, PRED_COMP_REF_P)); + vp9_get_pred_prob_comp_ref_p(pc, xd)); } else { vp9_write(bc, mi->ref_frame[0] != LAST_FRAME, - vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P1)); + vp9_get_pred_prob_single_ref_p1(pc, xd)); if (mi->ref_frame[0] != LAST_FRAME) vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME, - vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P2)); + vp9_get_pred_prob_single_ref_p2(pc, xd)); } } else { assert(mi->ref_frame[1] <= INTRA_FRAME); - assert(vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) == + assert(vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) == mi->ref_frame[0]); } @@ -629,60 +399,42 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, const nmv_context *nmvc = &pc->fc.nmvc; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + struct segmentation *seg = &xd->seg; MB_MODE_INFO *const mi = &m->mbmi; const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; const MB_PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; int skip_coeff; + const BLOCK_SIZE_TYPE bsize = mi->sb_type; - xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi); x->partition_info = x->pi + (m - pc->mi); #ifdef ENTROPY_STATS active_section = 9; #endif - if (cpi->mb.e_mbd.update_mb_segmentation_map) { - // Is temporal coding of the segment map enabled - if (pc->temporal_update) { - unsigned char prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID); - vp9_prob pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID); - - // Code the segment id prediction flag for this mb - vp9_write(bc, prediction_flag, pred_prob); - - // If the mb segment id wasn't predicted code explicitly - if (!prediction_flag) - write_mb_segid(bc, mi, &cpi->mb.e_mbd); + if (seg->update_map) { + if (seg->temporal_update) { + const int pred_flag = mi->seg_id_predicted; + vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + vp9_write(bc, pred_flag, pred_prob); + if (!pred_flag) + write_segment_id(bc, seg, segment_id); } else { - // Normal unpredicted coding - write_mb_segid(bc, mi, &cpi->mb.e_mbd); + write_segment_id(bc, seg, segment_id); } } - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - skip_coeff = 1; - } else { - skip_coeff = m->mbmi.mb_skip_coeff; - vp9_write(bc, skip_coeff, - vp9_get_pred_prob(pc, xd, PRED_MBSKIP)); - } + skip_coeff = write_skip_coeff(cpi, segment_id, m, bc); - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(bc, rf != INTRA_FRAME, - vp9_get_pred_prob(pc, xd, PRED_INTRA_INTER)); + vp9_get_pred_prob_intra_inter(pc, xd)); - if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT && + if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT && !(rf != INTRA_FRAME && - (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { - TX_SIZE sz = mi->txfm_size; - const vp9_prob 
*tx_probs = vp9_get_pred_probs(pc, xd, PRED_TX_SIZE); - vp9_write(bc, sz != TX_4X4, tx_probs[0]); - if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) { - vp9_write(bc, sz != TX_8X8, tx_probs[1]); - if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8) - vp9_write(bc, sz != TX_16X16, tx_probs[2]); - } + (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { + write_selected_txfm_size(cpi, mi->txfm_size, bsize, bc); } if (rf == INTRA_FRAME) { @@ -690,28 +442,24 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, active_section = 6; #endif - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + if (bsize >= BLOCK_SIZE_SB8X8) { const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); const int bsl = MIN(bwl, bhl); write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]); } else { int idx, idy; - int bw = 1 << b_width_log2(mi->sb_type); - int bh = 1 << b_height_log2(mi->sb_type); - for (idy = 0; idy < 2; idy += bh) - for (idx = 0; idx < 2; idx += bw) { - MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode.first; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]); } } - write_intra_mode(bc, mi->uv_mode, - pc->fc.uv_mode_prob[mode]); + write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); } else { vp9_prob *mv_ref_p; - encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]]; #ifdef ENTROPY_STATS @@ -719,8 +467,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #endif // If segment skip is not enabled code the mode. 
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - if (mi->sb_type >= BLOCK_SIZE_SB8X8) { + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (bsize >= BLOCK_SIZE_SB8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]); } @@ -728,38 +476,37 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, if (cpi->common.mcomp_filter_type == SWITCHABLE) { write_token(bc, vp9_switchable_interp_tree, - vp9_get_pred_probs(&cpi->common, xd, - PRED_SWITCHABLE_INTERP), + vp9_get_pred_probs_switchable_interp(&cpi->common, xd), vp9_switchable_interp_encodings + vp9_switchable_interp_map[mi->interp_filter]); } else { assert(mi->interp_filter == cpi->common.mcomp_filter_type); } - if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_SIZE_SB8X8) { int j; MB_PREDICTION_MODE blockmode; int_mv blockmv; - int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl; - int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { j = idy * 2 + idx; - blockmode = cpi->mb.partition_info->bmi[j].mode; - blockmv = cpi->mb.partition_info->bmi[j].mv; + blockmode = x->partition_info->bmi[j].mode; + blockmv = m->bmi[j].as_mv[0]; write_sb_mv_ref(bc, blockmode, mv_ref_p); vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]); if (blockmode == NEWMV) { #ifdef ENTROPY_STATS active_section = 11; #endif - vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv, + vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv, nmvc, xd->allow_high_precision_mv); if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(bc, - &cpi->mb.partition_info->bmi[j].second_mv.as_mv, + vp9_encode_mv(cpi, bc, + &m->bmi[j].as_mv[1].as_mv, &mi->best_second_mv.as_mv, nmvc, xd->allow_high_precision_mv); } @@ -769,12 +516,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #ifdef ENTROPY_STATS active_section = 5; #endif - vp9_encode_mv(bc, + vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv, nmvc, xd->allow_high_precision_mv); if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(bc, + vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, nmvc, xd->allow_high_precision_mv); } @@ -789,54 +536,40 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, const int ym = m->mbmi.mode; const int mis = c->mode_info_stride; const int segment_id = m->mbmi.segment_id; - int skip_coeff; - if (xd->update_mb_segmentation_map) - write_mb_segid(bc, &m->mbmi, xd); + if (xd->seg.update_map) + write_segment_id(bc, &xd->seg, m->mbmi.segment_id); - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - skip_coeff = 1; - } else { - skip_coeff = m->mbmi.mb_skip_coeff; - vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP)); - } + write_skip_coeff(cpi, segment_id, m, bc); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) { - TX_SIZE sz = m->mbmi.txfm_size; - const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE); - vp9_write(bc, sz != TX_4X4, tx_probs[0]); - if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) { - vp9_write(bc, sz != TX_8X8, tx_probs[1]); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8) - vp9_write(bc, sz != TX_16X16, 
tx_probs[2]); - } - } + if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT) + write_selected_txfm_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc); if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); const MB_PREDICTION_MODE L = xd->left_available ? left_block_mode(m, 0) : DC_PRED; - write_intra_mode(bc, ym, c->kf_y_mode_prob[A][L]); + write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; - int bw = 1 << b_width_log2(m->mbmi.sb_type); - int bh = 1 << b_height_log2(m->mbmi.sb_type); - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { int i = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, i, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? left_block_mode(m, i) : DC_PRED; - const int bm = m->bmi[i].as_mode.first; + const int bm = m->bmi[i].as_mode; #ifdef ENTROPY_STATS ++intra_mode_stats[A][L][bm]; #endif - write_intra_mode(bc, bm, c->kf_y_mode_prob[A][L]); + write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]); } } } - write_intra_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]); + write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, @@ -875,30 +608,16 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl, bhl; int bsl = b_width_log2(bsize); int bs = (1 << bsl) / 4; // mode_info step for subsize int n; - PARTITION_TYPE partition; + PARTITION_TYPE partition = PARTITION_NONE; BLOCK_SIZE_TYPE subsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = b_width_log2(m->mbmi.sb_type); - bhl = b_height_log2(m->mbmi.sb_type); - - // parse the partition type - if ((bwl == bsl) && (bhl == bsl)) - partition = PARTITION_NONE; - else if ((bwl == bsl) && (bhl < bsl)) - partition = PARTITION_HORZ; - else if ((bwl < bsl) && (bhl == bsl)) - partition = PARTITION_VERT; - else if ((bwl < bsl) && (bhl < bsl)) - partition = PARTITION_SPLIT; - else - assert(0); + partition = partition_lookup[bsl][m->mbmi.sb_type]; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index > 0) @@ -906,9 +625,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, if (bsize >= BLOCK_SIZE_SB8X8) { int pl; - int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize); - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = cm->above_seg_context + mi_col; + const int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize); + set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); // encode the partition information if (idx == 0) @@ -968,14 +686,12 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis; - for (mi_row = c->cur_tile_mi_row_start; - mi_row < c->cur_tile_mi_row_end; + for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end; mi_row += 8, m_ptr += 8 * mis) { m = m_ptr; - vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context)); - for (mi_col = c->cur_tile_mi_col_start; - mi_col < c->cur_tile_mi_col_end; - 
mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE) + vp9_zero(c->left_seg_context); + for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE) write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_SIZE_SB64X64); } @@ -1014,7 +730,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) { vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[txfm_size]; vp9_coeff_count *coef_counts = cpi->coef_counts[txfm_size]; unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cpi->common.fc.eob_branch_counts[txfm_size]; + cpi->common.counts.eob_branch[txfm_size]; vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[txfm_size]; vp9_prob full_probs[ENTROPY_NODES]; int i, j, k, l; @@ -1031,19 +747,11 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) { coef_counts[i][j][k][l], 0); vpx_memcpy(coef_probs[i][j][k][l], full_probs, sizeof(vp9_prob) * UNCONSTRAINED_NODES); -#if CONFIG_BALANCED_COEFTREE - coef_branch_ct[i][j][k][l][1][1] = eob_branch_ct[i][j][k][l] - - coef_branch_ct[i][j][k][l][1][0]; - coef_probs[i][j][k][l][1] = - get_binary_prob(coef_branch_ct[i][j][k][l][1][0], - coef_branch_ct[i][j][k][l][1][1]); -#else coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; coef_probs[i][j][k][l][0] = get_binary_prob(coef_branch_ct[i][j][k][l][0][0], coef_branch_ct[i][j][k][l][0][1]); -#endif #ifdef ENTROPY_STATS if (!cpi->dummy_packing) { int t; @@ -1096,11 +804,11 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (l >= 3 && k == 0) continue; if (t == PIVOT_NODE) - s = prob_diff_update_savings_search_model( + s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); else - s = prob_diff_update_savings_search( + s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); if (s > 0 && newp != oldp) u = 1; @@ -1137,11 +845,11 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (l >= 3 && k == 0) continue; if (t == PIVOT_NODE) - s = prob_diff_update_savings_search_model( + s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); else - s = prob_diff_update_savings_search( + s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd); if (s > 0 && newp != *oldp) @@ -1153,7 +861,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, #endif if (u) { /* send/use new probability */ - write_prob_diff_update(bc, newp, *oldp); + vp9_write_prob_diff_update(bc, newp, *oldp); *oldp = newp; } } @@ -1164,7 +872,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, } static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { - const TXFM_MODE txfm_mode = cpi->common.txfm_mode; + const TX_MODE tx_mode = cpi->common.tx_mode; vp9_clear_system_state(); @@ -1174,39 +882,39 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { update_coef_probs_common(bc, cpi, TX_4X4); // do not do this if not even allowed - if (txfm_mode > ONLY_4X4) + if (tx_mode > ONLY_4X4) update_coef_probs_common(bc, cpi, TX_8X8); - if (txfm_mode > ALLOW_8X8) + if (tx_mode > ALLOW_8X8) update_coef_probs_common(bc, cpi, TX_16X16); - if (txfm_mode > ALLOW_16X16) + if (tx_mode > ALLOW_16X16) update_coef_probs_common(bc, cpi, TX_32X32); 
} -static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, +static void encode_loopfilter(struct loopfilter *lf, struct vp9_write_bit_buffer *wb) { int i; // Encode the loop filter level and type - vp9_wb_write_literal(wb, pc->filter_level, 6); - vp9_wb_write_literal(wb, pc->sharpness_level, 3); + vp9_wb_write_literal(wb, lf->filter_level, 6); + vp9_wb_write_literal(wb, lf->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or // ref frame (if they are enabled). - vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_enabled); + vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled); - if (xd->mode_ref_lf_delta_enabled) { + if (lf->mode_ref_delta_enabled) { // Do the deltas need to be updated - vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_update); - if (xd->mode_ref_lf_delta_update) { + vp9_wb_write_bit(wb, lf->mode_ref_delta_update); + if (lf->mode_ref_delta_update) { // Send update for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - const int delta = xd->ref_lf_deltas[i]; + const int delta = lf->ref_deltas[i]; // Frame level data - if (delta != xd->last_ref_lf_deltas[i]) { - xd->last_ref_lf_deltas[i] = delta; + if (delta != lf->last_ref_deltas[i]) { + lf->last_ref_deltas[i] = delta; vp9_wb_write_bit(wb, 1); assert(delta != 0); @@ -1219,9 +927,9 @@ static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, // Send update for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - const int delta = xd->mode_lf_deltas[i]; - if (delta != xd->last_mode_lf_deltas[i]) { - xd->last_mode_lf_deltas[i] = delta; + const int delta = lf->mode_deltas[i]; + if (delta != lf->last_mode_deltas[i]) { + lf->last_mode_deltas[i] = delta; vp9_wb_write_bit(wb, 1); assert(delta != 0); @@ -1255,23 +963,23 @@ static void encode_quantization(VP9_COMMON *cm, static void encode_segmentation(VP9_COMP *cpi, - struct vp9_write_bit_buffer *wb) { + struct vp9_write_bit_buffer *wb) { int i, j; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; - vp9_wb_write_bit(wb, xd->segmentation_enabled); - if (!xd->segmentation_enabled) + struct segmentation *seg = &cpi->mb.e_mbd.seg; + + vp9_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; // Segmentation map - vp9_wb_write_bit(wb, xd->update_mb_segmentation_map); - if (xd->update_mb_segmentation_map) { + vp9_wb_write_bit(wb, seg->update_map); + if (seg->update_map) { // Select the coding strategy (temporal or spatial) vp9_choose_segmap_coding_method(cpi); // Write out probabilities used to decode unpredicted macro-block segments - for (i = 0; i < MB_SEG_TREE_PROBS; i++) { - const int prob = xd->mb_segment_tree_probs[i]; + for (i = 0; i < SEG_TREE_PROBS; i++) { + const int prob = seg->tree_probs[i]; const int update = prob != MAX_PROB; vp9_wb_write_bit(wb, update); if (update) @@ -1279,10 +987,10 @@ static void encode_segmentation(VP9_COMP *cpi, } // Write out the chosen coding method. 
- vp9_wb_write_bit(wb, cm->temporal_update); - if (cm->temporal_update) { + vp9_wb_write_bit(wb, seg->temporal_update); + if (seg->temporal_update) { for (i = 0; i < PREDICTION_PROBS; i++) { - const int prob = cm->segment_pred_probs[i]; + const int prob = seg->pred_probs[i]; const int update = prob != MAX_PROB; vp9_wb_write_bit(wb, update); if (update) @@ -1292,16 +1000,16 @@ static void encode_segmentation(VP9_COMP *cpi, } // Segmentation data - vp9_wb_write_bit(wb, xd->update_mb_segmentation_data); - if (xd->update_mb_segmentation_data) { - vp9_wb_write_bit(wb, xd->mb_segment_abs_delta); + vp9_wb_write_bit(wb, seg->update_data); + if (seg->update_data) { + vp9_wb_write_bit(wb, seg->abs_delta); - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { - const int active = vp9_segfeature_active(xd, i, j); + const int active = vp9_segfeature_active(seg, i, j); vp9_wb_write_bit(wb, active); if (active) { - const int data = vp9_get_segdata(xd, i, j); + const int data = vp9_get_segdata(seg, i, j); const int data_max = vp9_seg_feature_data_max(j); if (vp9_is_segfeature_signed(j)) { @@ -1321,12 +1029,12 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; // Mode - vp9_write_literal(w, MIN(cm->txfm_mode, ALLOW_32X32), 2); - if (cm->txfm_mode >= ALLOW_32X32) - vp9_write_bit(w, cm->txfm_mode == TX_MODE_SELECT); + vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2); + if (cm->tx_mode >= ALLOW_32X32) + vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT); // Probabilities - if (cm->txfm_mode == TX_MODE_SELECT) { + if (cm->tx_mode == TX_MODE_SELECT) { int i, j; unsigned int ct_8x8p[TX_SIZE_MAX_SB - 3][2]; unsigned int ct_16x16p[TX_SIZE_MAX_SB - 2][2]; @@ -1334,28 +1042,26 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i], + tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p); - for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) { - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_8x8p[i][j], + for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], VP9_MODE_UPDATE_PROB, ct_8x8p[j]); - } } + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i], + tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) { - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_16x16p[i][j], + for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], VP9_MODE_UPDATE_PROB, ct_16x16p[j]); - } } + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i], - ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) { - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_32x32p[i][j], + tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); + for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], VP9_MODE_UPDATE_PROB, ct_32x32p[j]); - } } #ifdef MODE_STATS if (!cpi->dummy_packing) @@ -1381,7 +1087,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { count[i] = 0; for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) - count[i] += cm->fc.switchable_interp_count[j][i]; + count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } if (c == 1) { @@ -1397,18 +1103,18 @@ static void 
fix_mcomp_filter_type(VP9_COMP *cpi) { } static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { - int min_log2_tiles, delta_log2_tiles, n_tile_bits, n; - vp9_get_tile_n_bits(cm, &min_log2_tiles, &delta_log2_tiles); - n_tile_bits = cm->log2_tile_columns - min_log2_tiles; - for (n = 0; n < delta_log2_tiles; n++) { - if (n_tile_bits--) { - vp9_wb_write_bit(wb, 1); - } else { - vp9_wb_write_bit(wb, 0); - break; - } - } + int min_log2_tile_cols, max_log2_tile_cols, ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + // columns + ones = cm->log2_tile_cols - min_log2_tile_cols; + while (ones--) + vp9_wb_write_bit(wb, 1); + + if (cm->log2_tile_cols < max_log2_tile_cols) + vp9_wb_write_bit(wb, 0); + + // rows vp9_wb_write_bit(wb, cm->log2_tile_rows != 0); if (cm->log2_tile_rows != 0) vp9_wb_write_bit(wb, cm->log2_tile_rows != 1); @@ -1449,6 +1155,57 @@ static int get_refresh_mask(VP9_COMP *cpi) { } } +static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { + VP9_COMMON *const cm = &cpi->common; + vp9_writer residual_bc; + + int tile_row, tile_col; + TOKENEXTRA *tok[4][1 << 6], *tok_end; + size_t total_size = 0; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + + vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * + mi_cols_aligned_to_sb(cm->mi_cols)); + + tok[0][0] = cpi->tok; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + if (tile_row) + tok[tile_row][0] = tok[tile_row - 1][tile_cols - 1] + + cpi->tok_count[tile_row - 1][tile_cols - 1]; + + for (tile_col = 1; tile_col < tile_cols; tile_col++) + tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] + + cpi->tok_count[tile_row][tile_col - 1]; + } + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + vp9_get_tile_row_offsets(cm, tile_row); + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + vp9_get_tile_col_offsets(cm, tile_col); + tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; + + if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) + vp9_start_encode(&residual_bc, data_ptr + total_size + 4); + else + vp9_start_encode(&residual_bc, data_ptr + total_size); + + write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); + assert(tok[tile_row][tile_col] == tok_end); + vp9_stop_encode(&residual_bc); + if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { + // size of this tile + write_be32(data_ptr + total_size, residual_bc.pos); + total_size += 4; + } + + total_size += residual_bc.pos; + } + } + + return total_size; +} + static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; @@ -1562,7 +1319,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, int i; vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LG2); + vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LOG2); vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]); } @@ -1580,66 +1337,27 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode); } - vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2); + vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2); - encode_loopfilter(cm, xd, wb); + encode_loopfilter(&xd->lf, wb); encode_quantization(cm, wb); encode_segmentation(cpi, wb); write_tile_info(cm, wb); } -void 
vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { - int i, bytes_packed; - VP9_COMMON *const pc = &cpi->common; - vp9_writer header_bc, residual_bc; +static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; + FRAME_CONTEXT *const fc = &cm->fc; + vp9_writer header_bc; - uint8_t *cx_data = dest; - struct vp9_write_bit_buffer wb = {dest, 0}; - struct vp9_write_bit_buffer first_partition_size_wb; - - write_uncompressed_header(cpi, &wb); - first_partition_size_wb = wb; - vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size - - bytes_packed = vp9_rb_bytes_written(&wb); - cx_data += bytes_packed; + vp9_start_encode(&header_bc, data); - compute_update_table(); - - vp9_start_encode(&header_bc, cx_data); - -#ifdef ENTROPY_STATS - if (pc->frame_type == INTER_FRAME) - active_section = 0; + if (xd->lossless) + cm->tx_mode = ONLY_4X4; else - active_section = 7; -#endif - - vp9_clear_system_state(); // __asm emms; - - vp9_copy(pc->fc.pre_coef_probs, pc->fc.coef_probs); - vp9_copy(pc->fc.pre_y_mode_prob, pc->fc.y_mode_prob); - vp9_copy(pc->fc.pre_uv_mode_prob, pc->fc.uv_mode_prob); - vp9_copy(pc->fc.pre_partition_prob, pc->fc.partition_prob[INTER_FRAME]); - pc->fc.pre_nmvc = pc->fc.nmvc; - vp9_copy(pc->fc.pre_switchable_interp_prob, pc->fc.switchable_interp_prob); - vp9_copy(pc->fc.pre_inter_mode_probs, pc->fc.inter_mode_probs); - vp9_copy(pc->fc.pre_intra_inter_prob, pc->fc.intra_inter_prob); - vp9_copy(pc->fc.pre_comp_inter_prob, pc->fc.comp_inter_prob); - vp9_copy(pc->fc.pre_comp_ref_prob, pc->fc.comp_ref_prob); - vp9_copy(pc->fc.pre_single_ref_prob, pc->fc.single_ref_prob); - vp9_copy(pc->fc.pre_tx_probs_8x8p, pc->fc.tx_probs_8x8p); - vp9_copy(pc->fc.pre_tx_probs_16x16p, pc->fc.tx_probs_16x16p); - vp9_copy(pc->fc.pre_tx_probs_32x32p, pc->fc.tx_probs_32x32p); - vp9_copy(pc->fc.pre_mbskip_probs, pc->fc.mbskip_probs); - - if (xd->lossless) { - pc->txfm_mode = ONLY_4X4; - } else { encode_txfm_probs(cpi, &header_bc); - } update_coef_probs(cpi, &header_bc); @@ -1649,124 +1367,106 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { vp9_update_skip_probs(cpi, &header_bc); - if (pc->frame_type != KEY_FRAME) { + if (cm->frame_type != KEY_FRAME) { + int i; #ifdef ENTROPY_STATS active_section = 1; #endif - update_inter_mode_probs(pc, &header_bc); - vp9_zero(cpi->common.fc.inter_mode_counts); + update_inter_mode_probs(cm, &header_bc); + vp9_zero(cm->counts.inter_mode); - if (pc->mcomp_filter_type == SWITCHABLE) + if (cm->mcomp_filter_type == SWITCHABLE) update_switchable_interp_probs(cpi, &header_bc); for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - vp9_cond_prob_diff_update(&header_bc, &pc->fc.intra_inter_prob[i], + vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], VP9_MODE_UPDATE_PROB, cpi->intra_inter_count[i]); - if (pc->allow_comp_inter_inter) { + if (cm->allow_comp_inter_inter) { const int comp_pred_mode = cpi->common.comp_pred_mode; - const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY); - const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION); + const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY; + const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION; vp9_write_bit(&header_bc, use_compound_pred); if (use_compound_pred) { vp9_write_bit(&header_bc, use_hybrid_pred); - if (use_hybrid_pred) { + if (use_hybrid_pred) for (i = 0; i < COMP_INTER_CONTEXTS; i++) - 
vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_inter_prob[i], + vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], VP9_MODE_UPDATE_PROB, cpi->comp_inter_count[i]); - } } } - if (pc->comp_pred_mode != COMP_PREDICTION_ONLY) { + if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { for (i = 0; i < REF_CONTEXTS; i++) { - vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][0], + vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], VP9_MODE_UPDATE_PROB, cpi->single_ref_count[i][0]); - vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][1], + vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], VP9_MODE_UPDATE_PROB, cpi->single_ref_count[i][1]); } } - if (pc->comp_pred_mode != SINGLE_PREDICTION_ONLY) { + if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) - vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_ref_prob[i], + vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], VP9_MODE_UPDATE_PROB, cpi->comp_ref_count[i]); - } update_mbintra_mode_probs(cpi, &header_bc); for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) { - vp9_prob Pnew[PARTITION_TYPES - 1]; + vp9_prob pnew[PARTITION_TYPES - 1]; unsigned int bct[PARTITION_TYPES - 1][2]; update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings, - vp9_partition_tree, Pnew, - pc->fc.partition_prob[pc->frame_type][i], bct, + vp9_partition_tree, pnew, + fc->partition_prob[cm->frame_type][i], bct, (unsigned int *)cpi->partition_count[i]); } vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc); } - vp9_stop_encode(&header_bc); + assert(header_bc.pos <= 0xffff); + return header_bc.pos; +} - // first partition size - assert(header_bc.pos <= 0xffff); - vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16); - *size = bytes_packed + header_bc.pos; - - { - int tile_row, tile_col, total_size = 0; - unsigned char *data_ptr = cx_data + header_bc.pos; - TOKENEXTRA *tok[4][1 << 6], *tok_end; - - vpx_memset(cpi->common.above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(&cpi->common)); - tok[0][0] = cpi->tok; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { - if (tile_row) { - tok[tile_row][0] = tok[tile_row - 1][pc->tile_columns - 1] + - cpi->tok_count[tile_row - 1][pc->tile_columns - 1]; - } - for (tile_col = 1; tile_col < pc->tile_columns; tile_col++) { - tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] + - cpi->tok_count[tile_row][tile_col - 1]; - } - } +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { + uint8_t *data = dest; + size_t first_part_size; + struct vp9_write_bit_buffer wb = {data, 0}; + struct vp9_write_bit_buffer saved_wb; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { - vp9_get_tile_row_offsets(pc, tile_row); - for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) { - vp9_get_tile_col_offsets(pc, tile_col); - tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; - - if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) - vp9_start_encode(&residual_bc, data_ptr + total_size + 4); - else - vp9_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); - assert(tok[tile_row][tile_col] == tok_end); - vp9_stop_encode(&residual_bc); - if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) { - // size of this tile - write_be32(data_ptr + total_size, residual_bc.pos); - total_size += 4; - } + write_uncompressed_header(cpi, &wb); + 
saved_wb = wb; + vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size - total_size += residual_bc.pos; - } - } + data += vp9_rb_bytes_written(&wb); - *size += total_size; - } + vp9_compute_update_table(); + +#ifdef ENTROPY_STATS + if (pc->frame_type == INTER_FRAME) + active_section = 0; + else + active_section = 7; +#endif + + vp9_clear_system_state(); // __asm emms; + + first_part_size = write_compressed_header(cpi, data); + data += first_part_size; + vp9_wb_write_literal(&saved_wb, first_part_size, 16); + + data += encode_tiles(cpi, data); + + *size = data - dest; } #ifdef ENTROPY_STATS diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 59cc3d9..4b49b17 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -24,11 +24,8 @@ typedef struct { } search_site; typedef struct { - int count; struct { MB_PREDICTION_MODE mode; - int_mv mv; - int_mv second_mv; } bmi[4]; } PARTITION_INFO; @@ -51,6 +48,7 @@ typedef struct { int comp_pred_diff; int single_pred_diff; int64_t txfm_rd_diff[NB_TXFM_MODES]; + int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; // Bit flag for each mode whether it has high error in comparison to others. unsigned int modes_with_high_error; @@ -66,9 +64,8 @@ struct macroblock_plane { // Quantizer setings int16_t *quant; - uint8_t *quant_shift; + int16_t *quant_shift; int16_t *zbin; - int16_t *zrun_zbin_boost; int16_t *round; // Zbin Over Quant value @@ -99,6 +96,7 @@ struct macroblock { signed int act_zbin_adj; int mv_best_ref_index[MAX_REF_FRAMES]; + unsigned int max_mv_context[MAX_REF_FRAMES]; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; @@ -115,6 +113,7 @@ struct macroblock { int **mvsadcost; int mbmode_cost[MB_MODE_COUNT]; + unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV]; int intra_uv_mode_cost[2][MB_MODE_COUNT]; int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES]; int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1] @@ -134,13 +133,18 @@ struct macroblock { unsigned char *active_ptr; // note that token_costs is the cost when eob node is skipped - vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2]; int optimize; // indicate if it is in the rd search loop or encoding process int rd_search; + int skip_encode; + + // Used to store sub partition's choices. + int fast_ms; + int_mv pred_mv; + int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. 
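
Note on the packing layout used above: the restructured vp9_pack_bitstream() writes the uncompressed header, reserves a 16-bit slot for the compressed-header size (back-patched through saved_wb after write_compressed_header() returns), and then appends the tile data from encode_tiles(), where every tile except the last is prefixed with its length as four big-endian bytes via write_be32(). A minimal sketch of the matching size handling on the reading side follows; the for_each_tile() helper is illustrative only and not part of this patch, while read_be32() simply mirrors write_be32().

#include <stddef.h>
#include <stdint.h>

/* Counterpart to write_be32(): a tile's size is stored as four big-endian
   bytes immediately before its boolcoded data. */
static uint32_t read_be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
         ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

/* Walk n_tiles concatenated tiles in [data, end): all but the last carry a
   4-byte size prefix; the last tile runs to the end of the buffer. */
static void for_each_tile(const uint8_t *data, const uint8_t *end, int n_tiles,
                          void (*visit)(const uint8_t *tile, size_t size)) {
  int i;
  for (i = 0; i < n_tiles; ++i) {
    size_t size;
    if (i == n_tiles - 1) {
      size = (size_t)(end - data);
    } else {
      size = read_be32(data);
      data += 4;
    }
    visit(data, size);
    data += size;
  }
}
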
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c index a90bcf5..3112dad 100644 --- a/libvpx/vp9/encoder/vp9_dct.c +++ b/libvpx/vp9/encoder/vp9_dct.c @@ -587,7 +587,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, temp_in[j] = out[j + i * 8]; ht.rows(temp_in, temp_out); for (j = 0; j < 8; ++j) - output[j + i * 8] = temp_out[j] >> 1; + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; } } @@ -978,7 +978,8 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, temp_in[j] = input[j * pitch + i] << 2; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; +// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows @@ -1366,6 +1367,9 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { temp_in[j] = input[j * shortpitch + i] << 2; dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) + // TODO(cd): see quality impact of only doing + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; + // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 54b6e24..798adc1 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/encoder/vp9_encodeframe.h" @@ -44,11 +43,8 @@ int enc_debug = 0; #endif -void vp9_select_interp_filter_type(VP9_COMP *cpi); - -static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize); +static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, + int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -64,10 +60,8 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); * Eventually this should be replaced by custom no-reference routines, * which will be faster. */ -static const uint8_t VP9_VAR_OFFS[16] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; - +static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Original activity measure from Tim T's code. static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { @@ -92,13 +86,11 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { } // Stub for alternative experimental activity measures. 
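
Stepping back to the vp9_dct.c hunk above: the 8x8 hybrid transform now scales its row outputs with (v + (v < 0)) >> 1 instead of a bare shift, and the 16x16 column rounding swaps (v > 0) for (v < 0). Both changes make negative coefficients round the same way as positive ones instead of being pulled toward negative infinity by the arithmetic shift. A small self-checking sketch, assuming arithmetic right shift of negative values, which libvpx relies on throughout:

#include <assert.h>
#include <stdint.h>

static int16_t halve_old(int16_t v)   { return v >> 1; }               /* floor       */
static int16_t halve_new(int16_t v)   { return (v + (v < 0)) >> 1; }   /* toward zero */
static int16_t quarter_new(int16_t v) { return (v + 1 + (v < 0)) >> 2; }

int main(void) {
  assert(halve_old(3) == 1 && halve_old(-3) == -2);      /* old: asymmetric   */
  assert(halve_new(3) == 1 && halve_new(-3) == -1);      /* new: symmetric    */
  assert(quarter_new(6) == 1 && quarter_new(-6) == -1);  /* ties toward zero  */
  return 0;
}
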
-static unsigned int alt_activity_measure(VP9_COMP *cpi, - MACROBLOCK *x, int use_dc_pred) { +static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, + int use_dc_pred) { return vp9_encode_intra(cpi, x, use_dc_pred); } - -DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 }; - +DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function @@ -135,14 +127,12 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { unsigned int tmp; // Create a list to sort to - CHECK_MEM_ERROR(sortlist, - vpx_calloc(sizeof(unsigned int), - cpi->common.MBs)); + CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int), + cpi->common.MBs)); // Copy map to sort list vpx_memcpy(sortlist, cpi->mb_activity_map, - sizeof(unsigned int) * cpi->common.MBs); - + sizeof(unsigned int) * cpi->common.MBs); // Ripple each value down to its correct position for (i = 1; i < cpi->common.MBs; i ++) { @@ -153,13 +143,13 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { sortlist[j - 1] = sortlist[j]; sortlist[j] = tmp; } else - break; + break; } } // Even number MBs so estimate median as mean of two either side. median = (1 + sortlist[cpi->common.MBs >> 1] + - sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; + sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; cpi->activity_avg = median; @@ -167,7 +157,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { } #else // Simple mean for now - cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs); + cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); #endif if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN) @@ -211,9 +201,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { b = 4 * act + cpi->activity_avg; if (b >= a) - *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; + *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; else - *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); + *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); #if OUTPUT_NORM_ACT_STATS fprintf(f, " %6d", *(x->mb_activity_ptr)); @@ -238,9 +228,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { // Loop through all MBs. 
Note activity of each, average activity and // calculate a normalized activity for each static void build_activity_map(VP9_COMP *cpi) { - MACROBLOCK *const x = &cpi->mb; + MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; #if ALT_ACT_MEASURE YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; @@ -285,7 +275,6 @@ static void build_activity_map(VP9_COMP *cpi) { x->plane[0].src.buf += 16; } - // adjust to the next row of mbs x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; } @@ -315,7 +304,7 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { a = act + (2 * cpi->activity_avg); b = (2 * act) + cpi->activity_avg; - x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a); + x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a); x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); x->errorperbit += (x->errorperbit == 0); #endif @@ -324,41 +313,38 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { adjust_act_zbin(cpi, x); } -static void update_state(VP9_COMP *cpi, - PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE_TYPE bsize, - int output_enabled) { +static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE_TYPE bsize, int output_enabled) { int i, x_idx, y; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; -#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS - MB_PREDICTION_MODE mb_mode = mi->mbmi.mode; -#endif + MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; + int mb_mode_index = ctx->best_mode_index; const int mis = cpi->common.mode_info_stride; - const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize); + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; -#if CONFIG_DEBUG - assert(mb_mode < MB_MODE_COUNT); + assert(mi->mbmi.mode < MB_MODE_COUNT); assert(mb_mode_index < MAX_MODES); assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES); assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES); -#endif - assert(mi->mbmi.sb_type == bsize); + // Restore the coding context of the MB to that that was in place // when the mode was picked for it - for (y = 0; y < bh; y++) { - for (x_idx = 0; x_idx < bw; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx && - (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) { + for (y = 0; y < mi_height; y++) { + for (x_idx = 0; x_idx < mi_width; x_idx++) { + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) { MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis; *mi_addr = *mi; } } } + // FIXME(rbultje) I'm pretty sure this should go to the end of this block + // (i.e. 
after the output_enabled) if (bsize < BLOCK_SIZE_SB32X32) { if (bsize < BLOCK_SIZE_MB16X16) ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8]; @@ -367,15 +353,15 @@ static void update_state(VP9_COMP *cpi, if (mbmi->ref_frame[0] != INTRA_FRAME && mbmi->sb_type < BLOCK_SIZE_SB8X8) { *x->partition_info = ctx->partition_info; - mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int; - mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int; + mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } x->skip = ctx->skip; if (!output_enabled) return; - if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) { + if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < NB_TXFM_MODES; i++) { cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i]; } @@ -404,31 +390,13 @@ static void update_state(VP9_COMP *cpi, THR_TM /*TM_PRED*/, THR_B_PRED /*I4X4_PRED*/, }; - cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++; + cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++; #endif } else { - /* - // Reduce the activation RD thresholds for the best choice mode - if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) && - (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2))) - { - int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2); - - cpi->rd_thresh_mult[mb_mode_index] = - (cpi->rd_thresh_mult[mb_mode_index] - >= (MIN_THRESHMULT + best_adjustment)) ? - cpi->rd_thresh_mult[mb_mode_index] - best_adjustment : - MIN_THRESHMULT; - cpi->rd_threshes[mb_mode_index] = - (cpi->rd_baseline_thresh[mb_mode_index] >> 7) - * cpi->rd_thresh_mult[mb_mode_index]; - - } - */ // Note how often each mode chosen as best cpi->mode_chosen_counts[mb_mode_index]++; - if (mbmi->ref_frame[0] != INTRA_FRAME && - (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { + if (mbmi->ref_frame[0] != INTRA_FRAME + && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { int_mv best_mv, best_second_mv; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; @@ -445,72 +413,55 @@ static void update_state(VP9_COMP *cpi, if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) { int i, j; - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i && - (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j) + for (j = 0; j < mi_height; ++j) + for (i = 0; i < mi_width; ++i) + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j) xd->mode_info_context[mis * j + i].mbmi = *mbmi; } - if (cpi->common.mcomp_filter_type == SWITCHABLE && - is_inter_mode(mbmi->mode)) { - ++cpi->common.fc.switchable_interp_count - [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + if (cpi->common.mcomp_filter_type == SWITCHABLE + && is_inter_mode(mbmi->mode)) { + ++cpi->common.counts.switchable_interp[ + vp9_get_pred_context_switchable_interp(xd)] + [vp9_switchable_interp_map[mbmi->interp_filter]]; } cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; - cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; - cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - } -} - -static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize, - int start_y, int height, int start_x, int width) { - const int bw = 1 << mi_width_log2(bsize), bh = 1 << 
mi_height_log2(bsize); - const int end_x = MIN(start_x + bw, width); - const int end_y = MIN(start_y + bh, height); - int x, y; - unsigned seg_id = -1; + cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; + cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - buf += width * start_y; - assert(start_y < cm->mi_rows && start_x < cm->cur_tile_mi_col_end); - for (y = start_y; y < end_y; y++, buf += width) { - for (x = start_x; x < end_x; x++) { - seg_id = MIN(seg_id, buf[x]); + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } - - return seg_id; } -void vp9_setup_src_planes(MACROBLOCK *x, - const YV12_BUFFER_CONFIG *src, +void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mb_row, int mb_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src + ->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src + ->alpha_stride}; int i; for (i = 0; i < MAX_MB_PLANE; i++) { - setup_pred_plane(&x->plane[i].src, - buffers[i], strides[i], - mb_row, mb_col, NULL, - x->e_mbd.plane[i].subsampling_x, + setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col, + NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); } } -static void set_offsets(VP9_COMP *cpi, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; +static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; const int idx_str = xd->mode_info_stride * mi_row + mi_col; - const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mb_row = mi_row >> 1; const int mb_col = mi_col >> 1; const int idx_map = mb_row * cm->mb_cols + mb_col; @@ -518,10 +469,10 @@ static void set_offsets(VP9_COMP *cpi, // entropy context structures for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].above_context = cm->above_context[i] + - (mi_col * 2 >> xd->plane[i].subsampling_x); - xd->plane[i].left_context = cm->left_context[i] + - (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); + xd->plane[i].above_context = cm->above_context[i] + + (mi_col * 2 >> xd->plane[i].subsampling_x); + xd->plane[i].left_context = cm->left_context[i] + + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); } // partition contexts @@ -532,29 +483,28 @@ static void set_offsets(VP9_COMP *cpi, x->active_ptr = cpi->active_map + idx_map; /* pointers to mode info contexts */ - x->partition_info = x->pi + idx_str; - xd->mode_info_context = cm->mi + idx_str; + x->partition_info = x->pi + idx_str; + xd->mode_info_context = cm->mi + idx_str; mbmi = &xd->mode_info_context->mbmi; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? - cm->prev_mi + idx_str : NULL; + xd->prev_mode_info_context = cm->prev_mi ? 
cm->prev_mi + idx_str : NULL; // Set up destination pointers setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); /* Set up limit values for MV components to prevent them from * extending beyond the UMV borders assuming 16x16 block size */ - x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE + - (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND)); - x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE + - (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND)); + x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); + x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); + x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE + + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND)); + x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE + + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND)); // Set up distance of MB to edge of frame in 1/8th pel units - assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1))); - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width); /* set up source buffers */ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); @@ -564,31 +514,28 @@ static void set_offsets(VP9_COMP *cpi, x->rdmult = cpi->RDMULT; /* segment ID */ - if (xd->segmentation_enabled) { - uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map - : cm->last_frame_seg_map; - mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, - cm->mi_rows, mi_col, cm->mi_cols); + if (xd->seg.enabled) { + uint8_t *map = xd->seg.update_map ? 
cpi->segmentation_map + : cm->last_frame_seg_map; + mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); - assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1)); vp9_mb_init_quantizer(cpi, x); - if (xd->segmentation_enabled && cpi->seg0_cnt > 0 && - !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) && - vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { + if (xd->seg.enabled && cpi->seg0_cnt > 0 + && !vp9_segfeature_active(&xd->seg, 0, SEG_LVL_REF_FRAME) + && vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt; } else { const int y = mb_row & ~3; const int x = mb_col & ~3; - const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); + const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = - cm->cur_tile_mi_col_start * cm->mb_rows >> 1; - const int mb_cols = - (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1; + const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1; + const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) + >> 1; - cpi->seg0_progress = - ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; + cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) + << 16) / cm->MBs; } } else { mbmi->segment_id = 0; @@ -596,8 +543,9 @@ static void set_offsets(VP9_COMP *cpi, } static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, - TOKENEXTRA **tp, int *totalrate, int *totaldist, - BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { + int *totalrate, int64_t *totaldist, + BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -613,53 +561,48 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - /* Find best coding mode & reconstruct the MB so it is available - * as a predictor for MBs that follow in the SB */ - if (cm->frame_type == KEY_FRAME) { - vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx); - } else { + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (cm->frame_type == KEY_FRAME) + vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx, + best_rd); + else vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist, - bsize, ctx); - } + bsize, ctx, best_rd); } static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MB_MODE_INFO * const mbmi = &mi->mbmi; if (cm->frame_type != KEY_FRAME) { - int segment_id, seg_ref_active; - - segment_id = mbmi->segment_id; - seg_ref_active = vp9_segfeature_active(xd, segment_id, - SEG_LVL_REF_FRAME); + const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id, + SEG_LVL_REF_FRAME); if (!seg_ref_active) - cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)] - [mbmi->ref_frame[0] > INTRA_FRAME]++; + cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)][mbmi + ->ref_frame[0] > INTRA_FRAME]++; // If the segment reference feature is enabled we have only a single // 
reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) { if (cm->comp_pred_mode == HYBRID_PREDICTION) - cpi->comp_inter_count[vp9_get_pred_context(cm, xd, - PRED_COMP_INTER_INTER)] - [mbmi->ref_frame[1] > INTRA_FRAME]++; + cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)] + [mbmi->ref_frame[1] > INTRA_FRAME]++; if (mbmi->ref_frame[1] > INTRA_FRAME) { - cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)] - [mbmi->ref_frame[0] == GOLDEN_FRAME]++; + cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)][mbmi + ->ref_frame[0] == GOLDEN_FRAME]++; } else { - cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)] - [0][mbmi->ref_frame[0] != LAST_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)] + [0][mbmi->ref_frame[0] != LAST_FRAME]++; if (mbmi->ref_frame[0] != LAST_FRAME) - cpi->single_ref_count[vp9_get_pred_context(cm, xd, - PRED_SINGLE_REF_P2)] - [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1] + [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } // Count of last ref frame 0,0 usage @@ -673,7 +616,7 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { // partition down to 4x4 block size is enabled. static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD * const xd = &x->e_mbd; switch (bsize) { case BLOCK_SIZE_SB64X64: @@ -704,7 +647,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL; + return NULL ; } } @@ -722,75 +665,80 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL; + return NULL ; } } static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], - PARTITION_CONTEXT sa[8], - PARTITION_CONTEXT sl[8], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int p; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int mwl = mi_width_log2(bsize), mw = 1 << mwl; - int mhl = mi_height_log2(bsize), mh = 1 << mhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->above_context[p] + - ((mi_col * 2) >> xd->plane[p].subsampling_x), - a + bw * p, - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); - vpx_memcpy(cm->left_context[p] + - ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), - l + bh * p, - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + vpx_memcpy( + cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + vpx_memcpy( + cm->left_context[p] + + ((mi_row & MI_MASK) 
* 2 >> xd->plane[p].subsampling_y), + l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); } vpx_memcpy(cm->above_seg_context + mi_col, sa, - sizeof(PARTITION_CONTEXT) * mw); + sizeof(PARTITION_CONTEXT) * mi_width); vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mh); + sizeof(PARTITION_CONTEXT) * mi_height); } static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, - ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], - PARTITION_CONTEXT sa[8], - PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int p; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int mwl = mi_width_log2(bsize), mw = 1 << mwl; - int mhl = mi_height_log2(bsize), mh = 1 << mhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; // buffer the above/left context information of the block in search. for (p = 0; p < MAX_MB_PLANE; ++p) { - vpx_memcpy(a + bw * p, cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); - vpx_memcpy(l + bh * p, cm->left_context[p] + - ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + vpx_memcpy( + a + num_4x4_blocks_wide * p, + cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + vpx_memcpy( + l + num_4x4_blocks_high * p, + cm->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); } vpx_memcpy(sa, cm->above_seg_context + mi_col, - sizeof(PARTITION_CONTEXT) * mw); + sizeof(PARTITION_CONTEXT) * mi_width); vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mh); + sizeof(PARTITION_CONTEXT) * mi_height); } -static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE bsize, int sub_index) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -813,16 +761,17 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, } } -static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE_TYPE 
bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; - int bwl, bhl; int UNINITIALIZED_IS_SAFE(pl); + PARTITION_TYPE partition; + BLOCK_SIZE_TYPE subsize; + int i; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -833,44 +782,46 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, pl = partition_plane_context(xd, bsize); c1 = *(get_sb_partitioning(x, bsize)); } + partition = partition_lookup[bsl][c1]; - bwl = b_width_log2(c1), bhl = b_height_log2(c1); - - if (bsl == bwl && bsl == bhl) { - if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) + switch (partition) { + case PARTITION_NONE: + if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); - } else if (bsl == bhl && bsl > bwl) { - if (output_enabled) - cpi->partition_count[pl][PARTITION_VERT]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); - } else if (bsl == bwl && bsl > bhl) { - if (output_enabled) - cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); - } else { - BLOCK_SIZE_TYPE subsize; - int i; - - assert(bwl < bsl && bhl < bsl); - subsize = get_subsize(bsize, PARTITION_SPLIT); + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); + break; + case PARTITION_VERT: + if (output_enabled) + cpi->partition_count[pl][PARTITION_VERT]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + break; + case PARTITION_HORZ: + if (output_enabled) + cpi->partition_count[pl][PARTITION_HORZ]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + break; + case PARTITION_SPLIT: + subsize = get_subsize(bsize, PARTITION_SPLIT); - if (output_enabled) - cpi->partition_count[pl][PARTITION_SPLIT]++; + if (output_enabled) + cpi->partition_count[pl][PARTITION_SPLIT]++; - for (i = 0; i < 4; i++) { - const int x_idx = i & 1, y_idx = i >> 1; + for (i = 0; i < 4; i++) { + const int x_idx = i & 1, y_idx = i >> 1; - *(get_sb_index(xd, subsize)) = i; - encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, - output_enabled, subsize); - } + *(get_sb_index(xd, subsize)) = i; + encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, + output_enabled, subsize); + } + break; + default: + assert(0); + break; } - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) { + if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } @@ -880,26 +831,28 @@ static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE_TYPE bsize) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bsl = b_width_log2(bsize); - int bs = (1 << bsl) / 2; // int block_row, block_col; - int row, col; - - // this test function sets the entire macroblock to the same bsize - for (block_row = 0; block_row < 8; block_row += bs) { - for (block_col = 0; block_col < 8; block_col += bs) { - for (row = 0; row < bs; row++) { - for (col = 0; col < bs; col++) { - m[(block_row+row)*mis + block_col+col].mbmi.sb_type 
= bsize; - } - } + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = bsize; + } + } +} +static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) { + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mode_info_stride; + int block_row, block_col; + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = + p[block_row * mis + block_col].mbmi.sb_type; } } } -static void set_block_size(VP9_COMMON *const cm, - MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis, - int mi_row, int mi_col) { +static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m, + BLOCK_SIZE_TYPE bsize, int mis, int mi_row, + int mi_col) { int row, col; int bwl = b_width_log2(bsize); int bhl = b_height_log2(bsize); @@ -911,10 +864,11 @@ static void set_block_size(VP9_COMMON *const cm, for (col = 0; col < bs; col++) { if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols) continue; - m2[row*mis+col].mbmi.sb_type = bsize; + m2[row * mis + col].mbmi.sb_type = bsize; } } } + typedef struct { int64_t sum_square_error; int64_t sum_error; @@ -922,11 +876,15 @@ typedef struct { int variance; } var; +typedef struct { + var none; + var horz[2]; + var vert[2]; +} partition_variance; + #define VT(TYPE, BLOCKSIZE) \ typedef struct { \ - var none; \ - var horz[2]; \ - var vert[2]; \ + partition_variance vt; \ BLOCKSIZE split[4]; } TYPE; VT(v8x8, var) @@ -934,20 +892,67 @@ VT(v16x16, v8x8) VT(v32x32, v16x16) VT(v64x64, v32x32) +typedef struct { + partition_variance *vt; + var *split[4]; +} vt_node; + typedef enum { V16X16, V32X32, V64X64, } TREE_LEVEL; +static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) { + int i; + switch (block_size) { + case BLOCK_SIZE_SB64X64: { + v64x64 *vt = (v64x64 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_SB32X32: { + v32x32 *vt = (v32x32 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_MB16X16: { + v16x16 *vt = (v16x16 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_SB8X8: { + v8x8 *vt = (v8x8 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i]; + break; + } + default: + node->vt = 0; + for (i = 0; i < 4; i++) + node->split[i] = 0; + assert(-1); + } +} + // Set variance values given sum square error, sum error, count. static void fill_variance(var *v, int64_t s2, int64_t s, int c) { v->sum_square_error = s2; v->sum_error = s; v->count = c; - v->variance = 256 - * (v->sum_square_error - v->sum_error * v->sum_error / v->count) - / v->count; + if (c > 0) + v->variance = 256 + * (v->sum_square_error - v->sum_error * v->sum_error / v->count) + / v->count; + else + v->variance = 0; } // Combine 2 variance structures by summing the sum_error, sum_square_error, @@ -956,31 +961,95 @@ void sum_2_variances(var *r, var *a, var*b) { fill_variance(r, a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->count + b->count); } -// Fill one level of our variance tree, by summing the split sums into each of -// the horizontal, vertical and none from split and recalculating variance. 
-#define fill_variance_tree(VT) \ - sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \ - sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \ - sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \ - sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \ - sum_2_variances(VT.none, VT.vert[0], VT.vert[1]); - -// Set the blocksize in the macroblock info structure if the variance is less -// than our threshold to one of none, horz, vert. -#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \ - if (VT.none.variance < threshold) { \ - set_block_size(cm, m, BLOCKSIZE, mis, R, C); \ - ACTION; \ - } \ - if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \ - set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \ - ACTION; \ - } \ - if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \ - set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \ - ACTION; \ + +static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { + vt_node node; + tree_to_node(data, block_size, &node); + sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]); + sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]); + sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]); + sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]); + sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]); +} + +#if PERFORM_RANDOM_PARTITIONING +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, + BLOCK_SIZE_TYPE block_size, int mi_row, + int mi_col, int mi_size) { + VP9_COMMON * const cm = &cpi->common; + vt_node vt; + const int mis = cm->mode_info_stride; + int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex; + + tree_to_node(data, block_size, &vt); + + // split none is available only if we have more than half a block size + // in width and height inside the visible image + if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows && + (rand() & 3) < 1) { + set_block_size(cm, m, block_size, mis, mi_row, mi_col); + return 1; + } + + // vertical split is available on all but the bottom border + if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold + && (rand() & 3) < 1) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + mi_col); + return 1; } + // horizontal split is available on all but the right border + if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold + && (rand() & 3) < 1) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, + mi_col); + return 1; + } + + return 0; +} + +#else + +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, + BLOCK_SIZE_TYPE block_size, int mi_row, + int mi_col, int mi_size) { + VP9_COMMON * const cm = &cpi->common; + vt_node vt; + const int mis = cm->mode_info_stride; + int64_t threshold = 50 * cpi->common.base_qindex; + + tree_to_node(data, block_size, &vt); + + // split none is available only if we have more than half a block size + // in width and height inside the visible image + if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows + && vt.vt->none.variance < threshold) { + set_block_size(cm, m, block_size, mis, mi_row, mi_col); + return 1; + } + + // vertical split is available on all but the bottom border + if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold + && vt.vt->vert[1].variance < threshold) { 
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + mi_col); + return 1; + } + + // horizontal split is available on all but the right border + if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold + && vt.vt->horz[1].variance < threshold) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, + mi_col); + return 1; + } + + return 0; +} +#endif + static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; @@ -993,8 +1062,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, v64x64 vt; unsigned char * s; int sp; - const unsigned char * d = xd->plane[0].pre->buf; - int dp = xd->plane[0].pre->stride; + const unsigned char * d; + int dp; int pixels_wide = 64, pixels_high = 64; vpx_memset(&vt, 0, sizeof(vt)); @@ -1014,179 +1083,228 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, // but this needs more experimentation. threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex; - // if ( cm->frame_type == KEY_FRAME ) { d = vp9_64x64_zeros; dp = 64; - // } + if (cm->frame_type != KEY_FRAME) { + int_mv nearest_mv, near_mv; + YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0]; + YV12_BUFFER_CONFIG *second_ref_fb = NULL; + + setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, + &xd->scale_factor[0]); + setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, + &xd->scale_factor[1]); + xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; + xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; + vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]], + &nearest_mv, &near_mv); + + xd->mode_info_context->mbmi.mv[0] = nearest_mv; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64); + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + + } // Fill in the entire tree of 8x8 variances for splits. 
for (i = 0; i < 4; i++) { const int x32_idx = ((i & 1) << 5); const int y32_idx = ((i >> 1) << 5); for (j = 0; j < 4; j++) { - const int x_idx = x32_idx + ((j & 1) << 4); - const int y_idx = y32_idx + ((j >> 1) << 4); - const uint8_t *st = s + y_idx * sp + x_idx; - const uint8_t *dt = d + y_idx * dp + x_idx; - unsigned int sse = 0; - int sum = 0; + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); v16x16 *vst = &vt.split[i].split[j]; - sse = sum = 0; - if (x_idx < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum); - fill_variance(&vst->split[0].none, sse, sum, 64); - sse = sum = 0; - if (x_idx + 8 < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum); - fill_variance(&vst->split[1].none, sse, sum, 64); - sse = sum = 0; - if (x_idx < pixels_wide && y_idx + 8 < pixels_high) - vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum); - fill_variance(&vst->split[2].none, sse, sum, 64); - sse = sum = 0; - if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high) - vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse, - &sum); - fill_variance(&vst->split[3].none, sse, sum, 64); + for (k = 0; k < 4; k++) { + int x_idx = x16_idx + ((k & 1) << 3); + int y_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x_idx < pixels_wide && y_idx < pixels_high) + vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); + fill_variance(&vst->split[k].vt.none, sse, sum, 64); + } } } // Fill the rest of the variance tree by summing the split partition // values. for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - fill_variance_tree(&vt.split[i].split[j]) + fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16); } - fill_variance_tree(&vt.split[i]) + fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32); } - fill_variance_tree(&vt) - - // Now go through the entire structure, splitting every blocksize until + fill_variance_tree(&vt, BLOCK_SIZE_SB64X64); + // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. 
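
The variance tree built above keeps raw sums rather than finished variances at every node, which is what lets fill_variance_tree() merge four children into horizontal, vertical and whole-block statistics without losing precision. A condensed sketch of that bookkeeping; the field names mirror the patch, while fill() and merge() stand in for fill_variance() and sum_2_variances():

#include <stdint.h>

typedef struct {
  int64_t sum_square_error;   /* SSE of (source - predictor) over the block */
  int64_t sum_error;          /* signed sum of the same differences         */
  int     count;              /* number of pixels contributing              */
  int     variance;           /* 256 * population variance                  */
} var_sketch;

static void fill(var_sketch *v, int64_t s2, int64_t s, int c) {
  v->sum_square_error = s2;
  v->sum_error = s;
  v->count = c;
  /* 256 * (SSE - SE*SE/N) / N, guarded against empty blocks as in the patch */
  v->variance = c > 0 ? (int)(256 * (s2 - s * s / c) / c) : 0;
}

static void merge(var_sketch *r, const var_sketch *a, const var_sketch *b) {
  /* Adding raw sums is exact; averaging two derived variances would not be. */
  fill(r, a->sum_square_error + b->sum_square_error,
          a->sum_error + b->sum_error,
          a->count + b->count);
}

set_vt_partitioning() then walks this tree top-down and accepts the largest block whose merged variance falls below a threshold derived from base_qindex, only descending to 8x8 when every larger candidate is too busy.
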
- set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return); - for (i = 0; i < 4; ++i) { - const int x32_idx = ((i & 1) << 2); - const int y32_idx = ((i >> 1) << 2); - set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx, - continue); - - for (j = 0; j < 4; ++j) { - const int x16_idx = ((j & 1) << 1); - const int y16_idx = ((j >> 1) << 1); - set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx, - mi_col+x32_idx+x16_idx, continue); - - for (k = 0; k < 4; ++k) { - const int x8_idx = (k & 1); - const int y8_idx = (k >> 1); - set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, - mi_row + y32_idx + y16_idx + y8_idx, - mi_col + x32_idx + x16_idx + x8_idx); + if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col, + 4)) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 2); + const int y32_idx = ((i >> 1) << 2); + if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32, + (mi_row + y32_idx), (mi_col + x32_idx), 2)) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 1); + const int y16_idx = ((j >> 1) << 1); + if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m, + BLOCK_SIZE_MB16X16, + (mi_row + y32_idx + y16_idx), + (mi_col + x32_idx + x16_idx), 1)) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1); + const int y8_idx = (k >> 1); + set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, + (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx)); + } + } + } } } } } static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { + int *rate, int64_t *dist, int do_recon) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl = b_width_log2(m->mbmi.sb_type); - int bhl = b_height_log2(m->mbmi.sb_type); int bsl = b_width_log2(bsize); - int bh = (1 << bhl); - int bs = (1 << bsl); - int bss = (1 << bsl)/4; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int ms = num_4x4_blocks_wide / 2; + int mh = num_4x4_blocks_high / 2; + int bss = (1 << bsl) / 4; int i, pl; - PARTITION_TYPE partition; + PARTITION_TYPE partition = PARTITION_NONE; BLOCK_SIZE_TYPE subsize; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; - int r = 0, d = 0; + int last_part_rate = INT_MAX; + int64_t last_part_dist = INT_MAX; + int split_rate = INT_MAX; + int64_t split_dist = INT_MAX; + int none_rate = INT_MAX; + int64_t none_dist = INT_MAX; + int chosen_rate = INT_MAX; + int64_t chosen_dist = INT_MAX; + BLOCK_SIZE_TYPE sub_subsize = BLOCK_SIZE_AB4X4; + int splits_below = 0; + BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - // parse the partition type - if ((bwl == bsl) && (bhl == bsl)) - partition = PARTITION_NONE; - else if ((bwl == bsl) && (bhl < bsl)) - partition = PARTITION_HORZ; - else if ((bwl < bsl) && (bhl == bsl)) - partition = PARTITION_VERT; - else if ((bwl < bsl) && (bhl < bsl)) - partition = PARTITION_SPLIT; - else - assert(0); + partition = partition_lookup[bsl][bs_type]; subsize = get_subsize(bsize, partition); - // TODO(JBB): this restriction is here because pick_sb_modes can return - // r's that are INT_MAX meaning we can't select a mode / mv for this block. 
- // when the code is made to work for less than sb8x8 we need to come up with - // a solution to this problem. - assert(subsize >= BLOCK_SIZE_SB8X8); - - if (bsize >= BLOCK_SIZE_SB8X8) { - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = cm->above_seg_context + mi_col; + if (bsize < BLOCK_SIZE_SB8X8) { + if (xd->ab_index != 0) { + *rate = 0; + *dist = 0; + return; + } + } else { *(get_sb_partitioning(x, bsize)) = subsize; } - - pl = partition_plane_context(xd, bsize); save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + + if (cpi->sf.adjust_partitioning_from_last_frame) { + // Check if any of the sub blocks are further split. + if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) { + sub_subsize = get_subsize(subsize, PARTITION_SPLIT); + splits_below = 1; + for (i = 0; i < 4; i++) { + int jj = i >> 1, ii = i & 0x01; + if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) { + splits_below = 0; + } + } + } + + // If partition is not none try none unless each of the 4 splits are split + // even further.. + if (partition != PARTITION_NONE && !splits_below && + mi_row + (ms >> 1) < cm->mi_rows && + mi_col + (ms >> 1) < cm->mi_cols) { + *(get_sb_partitioning(x, bsize)) = bsize; + pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize, + get_block_context(x, bsize), INT64_MAX); + + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + none_rate += x->partition_cost[pl][PARTITION_NONE]; + + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + m->mbmi.sb_type = bs_type; + *(get_sb_partitioning(x, bsize)) = subsize; + } + } + switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, - get_block_context(x, bsize)); - r += x->partition_cost[pl][PARTITION_NONE]; + pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, - get_block_context(x, subsize)); - if (mi_row + (bh >> 1) <= cm->mi_rows) { - int rt, dt; + pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + subsize, get_block_context(x, subsize), INT64_MAX); + if (last_part_rate != INT_MAX && + bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) { + int rt = 0; + int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize, - get_block_context(x, subsize)); - r += rt; - d += dt; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, + get_block_context(x, subsize), INT64_MAX); + if (rt == INT_MAX || dt == INT_MAX) { + last_part_rate = INT_MAX; + last_part_dist = INT_MAX; + break; + } + + last_part_rate += rt; + last_part_dist += dt; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_HORZ]; break; case PARTITION_VERT: *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, - get_block_context(x, subsize)); - if (mi_col + (bs >> 1) <= cm->mi_cols) { - int rt, dt; + pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + subsize, get_block_context(x, subsize), INT64_MAX); + if (last_part_rate != INT_MAX && + bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) 
< cm->mi_cols) { + int rt = 0; + int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize, - get_block_context(x, subsize)); - r += rt; - d += dt; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, + get_block_context(x, subsize), INT64_MAX); + if (rt == INT_MAX || dt == INT_MAX) { + last_part_rate = INT_MAX; + last_part_dist = INT_MAX; + break; + } + last_part_rate += rt; + last_part_dist += dt; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_VERT]; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); break; case PARTITION_SPLIT: + // Split partition. + last_part_rate = 0; + last_part_dist = 0; for (i = 0; i < 4; i++) { - int x_idx = (i & 1) * (bs >> 2); - int y_idx = (i >> 1) * (bs >> 2); + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); int jj = i >> 1, ii = i & 0x01; - int rt, dt; + int rt; + int64_t dt; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -1194,56 +1312,137 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, *(get_sb_index(xd, subsize)) = i; rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &rt, &dt); - r += rt; - d += dt; + mi_col + x_idx, subsize, &rt, &dt, i != 3); + if (rt == INT_MAX || dt == INT_MAX) { + last_part_rate = INT_MAX; + last_part_dist = INT_MAX; + break; + } + last_part_rate += rt; + last_part_dist += dt; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_SPLIT]; break; default: assert(0); } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (last_part_rate < INT_MAX) + last_part_rate += x->partition_cost[pl][partition]; + + if (cpi->sf.adjust_partitioning_from_last_frame + && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8 + && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) + && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) { + BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + split_rate = 0; + split_dist = 0; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - // update partition context -#if CONFIG_AB4X4 - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { -#else - if (bsize > BLOCK_SIZE_SB8X8 - && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) { -#endif + // Split partition. 
+ for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2); + int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2); + int rt = 0; + int64_t dt = 0; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + + if ((mi_row + y_idx >= cm->mi_rows) + || (mi_col + x_idx >= cm->mi_cols)) + continue; + + *(get_sb_index(xd, split_subsize)) = i; + *(get_sb_partitioning(x, bsize)) = split_subsize; + *(get_sb_partitioning(x, split_subsize)) = split_subsize; + + save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + + pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + split_subsize, get_block_context(x, split_subsize), + INT64_MAX); + + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + + if (rt == INT_MAX || dt == INT_MAX) { + split_rate = INT_MAX; + split_dist = INT_MAX; + break; + } + + if (i != 3) + encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, + split_subsize); + + split_rate += rt; + split_dist += dt; + set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx); + pl = partition_plane_context(xd, bsize); + split_rate += x->partition_cost[pl][PARTITION_NONE]; + } set_partition_seg_context(cm, xd, mi_row, mi_col); - update_partition_context(xd, subsize, bsize); + pl = partition_plane_context(xd, bsize); + if (split_rate < INT_MAX) { + split_rate += x->partition_cost[pl][PARTITION_SPLIT]; + + chosen_rate = split_rate; + chosen_dist = split_dist; + } } + + // If last_part is better set the partitioning to that... + if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) + < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { + m->mbmi.sb_type = bsize; + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = subsize; + chosen_rate = last_part_rate; + chosen_dist = last_part_dist; + } + // If none was better set the partitioning to that... + if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) + > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = bsize; + chosen_rate = none_rate; + chosen_dist = none_dist; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (r < INT_MAX && d < INT_MAX) + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. + if ( bsize == BLOCK_SIZE_SB64X64) + assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); + + if (do_recon) encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); - *rate = r; - *dist = d; + + *rate = chosen_rate; + *dist = chosen_dist; } // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previously rate-distortion optimization // results, for encoding speed-up. 
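
To summarise the selection at the end of rd_use_partition() above: the partitioning inherited from the previous frame, an optional PARTITION_NONE retry, and an optional forced four-way split are each given a rate and a distortion, INT_MAX marks a candidate that could not be coded, and the cheapest rate-distortion cost wins. A condensed sketch of that choice; rd_cost() is a simplified stand-in for the fixed-point RDCOST macro:

#include <limits.h>
#include <stdint.h>

typedef struct { int rate; int64_t dist; } rd_stats;

static int64_t rd_cost(int rdmult, const rd_stats *s) {
  if (s->rate == INT_MAX)                        /* candidate not codeable */
    return INT64_MAX;
  return (int64_t)s->rate * rdmult + s->dist;    /* simplified Lagrangian  */
}

static rd_stats choose_partition(int rdmult, rd_stats split,
                                 rd_stats last_part, rd_stats none) {
  rd_stats chosen = split;                       /* forced split candidate */
  if (rd_cost(rdmult, &last_part) < rd_cost(rdmult, &chosen))
    chosen = last_part;               /* keep last frame's partitioning */
  if (rd_cost(rdmult, &none) < rd_cost(rdmult, &chosen))
    chosen = none;                    /* or collapse to PARTITION_NONE  */
  return chosen;
}

For a 64x64 superblock at least one candidate must be codeable, which is what the assert on chosen_rate and chosen_dist above enforces before the block is re-encoded when do_recon is set.
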
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, + int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, + int64_t *dist, int do_recon, int64_t best_rd) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int bsl = b_width_log2(bsize), bs = 1 << bsl; int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; int i, pl; BLOCK_SIZE_TYPE subsize; - int srate = INT_MAX, sdist = INT_MAX; + int srate = INT_MAX; + int64_t sdist = INT_MAX; + + (void) *tp_orig; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) { @@ -1256,127 +1455,343 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); // PARTITION_SPLIT - if (bsize >= BLOCK_SIZE_SB8X8) { - int r4 = 0, d4 = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - *(get_sb_partitioning(x, bsize)) = subsize; + if (!cpi->sf.use_partitions_greater_than + || (cpi->sf.use_partitions_greater_than + && bsize > cpi->sf.greater_than_block_size)) { + if (bsize > BLOCK_SIZE_SB8X8) { + int r4 = 0; + int64_t d4 = 0, sum_rd = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r = 0; + int64_t d = 0; - for (i = 0; i < 4; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0, d = 0; - - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &r, &d); + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, + &d, i != 3, best_rd - sum_rd); - r4 += r; - d4 += d; + if (r == INT_MAX) { + r4 = INT_MAX; + sum_rd = INT64_MAX; + } else { + r4 += r; + d4 += d; + sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); + } + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r4 != INT_MAX && i == 4) { + r4 += x->partition_cost[pl][PARTITION_SPLIT]; + *(get_sb_partitioning(x, bsize)) = subsize; + assert(r4 >= 0); + assert(d4 >= 0); + srate = r4; + sdist = d4; + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r4 < INT_MAX) - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - assert(r4 >= 0); - assert(d4 >= 0); - srate = r4; - sdist = d4; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - // PARTITION_HORZ - if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { - int r2, d2; - int r = 0, d = 0; - subsize = get_subsize(bsize, PARTITION_HORZ); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, - get_block_context(x, subsize)); - - if (mi_row + (ms >> 1) < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), subsize, 
0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize, - get_block_context(x, subsize)); - r2 += r; - d2 += d; - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; + x->fast_ms = 0; + x->pred_mv.as_int = 0; + x->subblock_ref = 0; + + // Use 4 subblocks' motion estimation results to speed up current + // partition's checking. + if (cpi->sf.using_small_partition_info) { + // Only use 8x8 result for non HD videos. + // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; + int use_8x8 = 1; + + if (cm->frame_type && !cpi->is_src_frame_alt_ref && + ((use_8x8 && bsize == BLOCK_SIZE_MB16X16) || + bsize == BLOCK_SIZE_SB32X32 || bsize == BLOCK_SIZE_SB64X64)) { + int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; + + if (bsize == BLOCK_SIZE_MB16X16) { + ref0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi. + ref_frame[0]; + ref1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi. + ref_frame[0]; + ref2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi. + ref_frame[0]; + ref3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi. + ref_frame[0]; + } else if (bsize == BLOCK_SIZE_SB32X32) { + ref0 = x->mb_context[xd->sb_index][0].mic.mbmi.ref_frame[0]; + ref1 = x->mb_context[xd->sb_index][1].mic.mbmi.ref_frame[0]; + ref2 = x->mb_context[xd->sb_index][2].mic.mbmi.ref_frame[0]; + ref3 = x->mb_context[xd->sb_index][3].mic.mbmi.ref_frame[0]; + } else if (bsize == BLOCK_SIZE_SB64X64) { + ref0 = x->sb32_context[0].mic.mbmi.ref_frame[0]; + ref1 = x->sb32_context[1].mic.mbmi.ref_frame[0]; + ref2 = x->sb32_context[2].mic.mbmi.ref_frame[0]; + ref3 = x->sb32_context[3].mic.mbmi.ref_frame[0]; + } + + // Currently, only consider 4 inter ref frames. + if (ref0 && ref1 && ref2 && ref3) { + int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0, + mvr3 = 0, mvc3 = 0; + int d01, d23, d02, d13; // motion vector distance between 2 blocks + + // Get each subblock's motion vectors. + if (bsize == BLOCK_SIZE_MB16X16) { + mvr0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0]. + as_mv.row; + mvc0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0]. + as_mv.col; + mvr1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0]. + as_mv.row; + mvc1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0]. + as_mv.col; + mvr2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0]. + as_mv.row; + mvc2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0]. + as_mv.col; + mvr3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0]. + as_mv.row; + mvc3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0]. 
+ as_mv.col; + } else if (bsize == BLOCK_SIZE_SB32X32) { + mvr0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.row; + mvc0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.col; + mvr1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.row; + mvc1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.col; + mvr2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.row; + mvc2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.col; + mvr3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.row; + mvc3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.col; + } else if (bsize == BLOCK_SIZE_SB64X64) { + mvr0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.row; + mvc0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.col; + mvr1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.row; + mvc1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.col; + mvr2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.row; + mvc2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.col; + mvr3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.row; + mvc3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.col; + } + + // Adjust sign if ref is alt_ref + if (cm->ref_frame_sign_bias[ref0]) { + mvr0 *= -1; + mvc0 *= -1; + } + + if (cm->ref_frame_sign_bias[ref1]) { + mvr1 *= -1; + mvc1 *= -1; + } + + if (cm->ref_frame_sign_bias[ref2]) { + mvr2 *= -1; + mvc2 *= -1; + } + + if (cm->ref_frame_sign_bias[ref3]) { + mvr3 *= -1; + mvc3 *= -1; + } + + // Calculate mv distances. + d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); + d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); + d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); + d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); + + if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) { + // Set fast motion search level. + x->fast_ms = 1; + + // Calculate prediction MV + x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2; + x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2; + + if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && + d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { + // Set fast motion search level. 
+ x->fast_ms = 2; + + if (!d01 && !d23 && !d02 && !d13) { + x->fast_ms = 3; + x->subblock_ref = ref0; + } + } + } + } } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - // PARTITION_VERT - if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { - int r2, d2; - subsize = get_subsize(bsize, PARTITION_VERT); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, - get_block_context(x, subsize)); - if (mi_col + (ms >> 1) < cm->mi_cols) { - int r = 0, d = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize, - get_block_context(x, subsize)); - r2 += r; - d2 += d; - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; + if (!cpi->sf.use_partitions_less_than + || (cpi->sf.use_partitions_less_than + && bsize <= cpi->sf.less_than_block_size)) { + int larger_is_better = 0; + // PARTITION_NONE + if ((mi_row + (ms >> 1) < cm->mi_rows) && + (mi_col + (ms >> 1) < cm->mi_cols)) { + int r; + int64_t d; + pick_sb_modes(cpi, mi_row, mi_col, &r, &d, bsize, + get_block_context(x, bsize), best_rd); + if (r != INT_MAX && bsize >= BLOCK_SIZE_SB8X8) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r += x->partition_cost[pl][PARTITION_NONE]; + } + + if (r != INT_MAX && + (bsize == BLOCK_SIZE_SB8X8 || + RDCOST(x->rdmult, x->rddiv, r, d) < + RDCOST(x->rdmult, x->rddiv, srate, sdist))) { + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r, d)); + srate = r; + sdist = d; + larger_is_better = 1; + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = bsize; + } } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - } - // PARTITION_NONE - if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - int r, d; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, - get_block_context(x, bsize)); - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize == BLOCK_SIZE_SB8X8) { + int r4 = 0; + int64_t d4 = 0, sum_rd = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r = 0; + int64_t d = 0; + + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, + &d, i != 3, best_rd - sum_rd); + + if (r == INT_MAX) { + r4 = INT_MAX; + sum_rd = INT64_MAX; + } else { + r4 += r; + d4 += d; + sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); + } + } set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_NONE]; + if (r4 != INT_MAX && i == 4) { + r4 += x->partition_cost[pl][PARTITION_SPLIT]; + if (RDCOST(x->rdmult, x->rddiv, r4, d4) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r4; + sdist = d4; + larger_is_better = 0; + *(get_sb_partitioning(x, bsize)) = subsize; + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); + } + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, 
bsize); } - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r; - sdist = d; - if (bsize >= BLOCK_SIZE_SB8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + if (!cpi->sf.use_square_partition_only && + (!cpi->sf.less_rectangular_check ||!larger_is_better)) { + // PARTITION_HORZ + if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { + int r2, r = 0; + int64_t d2, d = 0, h_rd; + subsize = get_subsize(bsize, PARTITION_HORZ); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, + get_block_context(x, subsize), best_rd); + h_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); + + if (r2 != INT_MAX && h_rd < best_rd && + mi_row + (ms >> 1) < cm->mi_rows) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &r, &d, subsize, + get_block_context(x, subsize), best_rd - h_rd); + if (r == INT_MAX) { + r2 = INT_MAX; + } else { + r2 += r; + d2 += d; + } + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r2 < INT_MAX) + r2 += x->partition_cost[pl][PARTITION_HORZ]; + if (r2 != INT_MAX && RDCOST(x->rdmult, x->rddiv, r2, d2) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r2, d2)); + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } + + // PARTITION_VERT + if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { + int r2; + int64_t d2, v_rd; + subsize = get_subsize(bsize, PARTITION_VERT); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, + get_block_context(x, subsize), best_rd); + v_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); + if (r2 != INT_MAX && v_rd < best_rd && + mi_col + (ms >> 1) < cm->mi_cols) { + int r = 0; + int64_t d = 0; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &r, &d, subsize, + get_block_context(x, subsize), best_rd - v_rd); + if (r == INT_MAX) { + r2 = INT_MAX; + } else { + r2 += r; + d2 += d; + } + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r2 < INT_MAX) + r2 += x->partition_cost[pl][PARTITION_VERT]; + if (r2 != INT_MAX && + RDCOST(x->rdmult, x->rddiv, r2, d2) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } } } - *rate = srate; *dist = sdist; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (srate < INT_MAX && sdist < INT_MAX) + if (srate < INT_MAX && sdist < INT_MAX && do_recon) encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); if (bsize == BLOCK_SIZE_SB64X64) { @@ -1388,9 +1803,61 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, } } -static void encode_sb_row(VP9_COMP *cpi, int mi_row, - TOKENEXTRA **tp, int *totalrate) { - VP9_COMMON *const cm = &cpi->common; +// Examines 64x64 block and chooses a best reference frame +static void rd_pick_reference_frame(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, + int mi_col, int *rate, int64_t *dist) { + VP9_COMMON * const cm = &cpi->common; 
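/*
 * [Editor's illustrative sketch -- not part of the commit above.]
 * rd_pick_partition() compares NONE/SPLIT/HORZ/VERT purely by rate-distortion
 * cost and threads the running best cost ("best_rd") through the recursion so
 * a sub-search can stop as soon as its accumulated cost can no longer win.
 * The self-contained fragment below shows only that accumulate-and-prune
 * pattern; rd_cost(), search_subblock() and every constant in it are
 * placeholders, not libvpx symbols (the real cost macro is RDCOST() in
 * vp9_rdopt.h and uses fixed-point rdmult/rddiv arithmetic).
 */
#include <stdint.h>
#include <stdio.h>

/* Stand-in for RDCOST(x->rdmult, x->rddiv, rate, dist). */
static int64_t rd_cost(int64_t lambda, int rate, int64_t dist) {
  return lambda * (int64_t)rate + dist;
}

/* Hypothetical per-subblock search standing in for the recursive
 * rd_pick_partition() call; the returned numbers are made up. */
static void search_subblock(int i, int *rate, int64_t *dist) {
  *rate = 100 + 10 * i;
  *dist = 4000 - 500 * (int64_t)i;
}

static int64_t split_partition_cost(int64_t lambda, int64_t best_rd) {
  int rate_sum = 0;
  int64_t dist_sum = 0, sum_rd = 0;
  int i;

  /* Accumulate the four sub-partitions, handing each child only the budget
   * that is still left and stopping early once the running cost can no
   * longer beat the best partitioning seen so far. */
  for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
    int r;
    int64_t d;
    search_subblock(i, &r, &d);
    rate_sum += r;
    dist_sum += d;
    sum_rd = rd_cost(lambda, rate_sum, dist_sum);
  }

  /* Mirrors the "i == 4" check above: a split only competes if all four
   * sub-blocks were actually evaluated within budget. */
  return (i == 4 && sum_rd < best_rd) ? sum_rd : INT64_MAX;
}

int main(void) {
  printf("split rd cost: %lld\n",
         (long long)split_partition_cost(120, (int64_t)1 << 30));
  return 0;
}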
+ MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; + int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl; + int ms = bs / 2; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + int pl; + int r; + int64_t d; + + save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64); + + // Default is non mask (all reference frames allowed. + cpi->ref_frame_mask = 0; + + // Do RD search for 64x64. + if ((mi_row + (ms >> 1) < cm->mi_rows) && + (mi_col + (ms >> 1) < cm->mi_cols)) { + cpi->set_ref_frame_mask = 1; + pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_SIZE_SB64X64, + get_block_context(x, BLOCK_SIZE_SB64X64), INT64_MAX); + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); + r += x->partition_cost[pl][PARTITION_NONE]; + + *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64; + cpi->set_ref_frame_mask = 0; + } + + *rate = r; + *dist = d; + // RDCOST(x->rdmult, x->rddiv, r, d) + + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64); + + /*if (srate < INT_MAX && sdist < INT_MAX) + encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64); + + if (bsize == BLOCK_SIZE_SB64X64) { + assert(tp_orig < *tp); + assert(srate < INT_MAX); + assert(sdist < INT_MAX); + } else { + assert(tp_orig == *tp); + } + */ +} + +static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, + int *totalrate) { + VP9_COMMON * const cm = &cpi->common; int mi_col; // Initialize the left context for the new SB row @@ -1398,19 +1865,56 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) { - int dummy_rate, dummy_dist; - if (cpi->speed < 5) { - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, - &dummy_rate, &dummy_dist); - } else { + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE) { + int dummy_rate; + int64_t dummy_dist; + + // Initialize a mask of modes that we will not consider; + // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden) + if (cpi->common.frame_type == KEY_FRAME) + cpi->unused_mode_skip_mask = 0; + else + cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00; + + if (cpi->sf.reference_masking) { + rd_pick_reference_frame(cpi, tp, mi_row, mi_col, + &dummy_rate, &dummy_dist); + } + + if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning || + cpi->sf.use_one_partition_size_always ) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO *m = cm->mi + idx_str; - // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64); - choose_partitioning(cpi, cm->mi, mi_row, mi_col); - rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, - &dummy_rate, &dummy_dist); + MODE_INFO *p = cm->prev_mi + idx_str; + + if (cpi->sf.use_one_partition_size_always) { + set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); + set_partitioning(cpi, m, cpi->sf.always_this_block_size); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1); + } else if (cpi->sf.partition_by_variance) { + choose_partitioning(cpi, cm->mi, mi_row, mi_col); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1); + } else { + if ((cpi->common.current_video_frame + % 
cpi->sf.last_partitioning_redo_frequency) == 0 + || cm->prev_mi == 0 + || cpi->common.show_frame == 0 + || cpi->common.frame_type == KEY_FRAME + || cpi->is_src_frame_alt_ref) { + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1, INT64_MAX); + } else { + copy_partitioning(cpi, m, p); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1); + } + } + } else { + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1, INT64_MAX); } } } @@ -1419,15 +1923,12 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); x->act_zbin_adj = 0; cpi->seg0_idx = 0; xd->mode_info_stride = cm->mode_info_stride; - xd->frame_type = cm->frame_type; - - xd->frames_since_golden = cm->frames_since_golden; - xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame; // reset intra mode contexts if (cm->frame_type == KEY_FRAME) @@ -1437,62 +1938,65 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { vp9_setup_src_planes(x, cpi->Source, 0, 0); // TODO(jkoleszar): are these initializations required? - setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL, - 0, 0, NULL, NULL); + setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], + 0, 0, NULL); setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0); - vp9_build_block_offsets(x); - - vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_context->mbmi.uv_mode = DC_PRED; vp9_zero(cpi->y_mode_count) vp9_zero(cpi->y_uv_mode_count) - vp9_zero(cm->fc.inter_mode_counts) + vp9_zero(cm->counts.inter_mode) vp9_zero(cpi->partition_count); vp9_zero(cpi->intra_inter_count); vp9_zero(cpi->comp_inter_count); vp9_zero(cpi->single_ref_count); vp9_zero(cpi->comp_ref_count); - vp9_zero(cm->fc.tx_count_32x32p); - vp9_zero(cm->fc.tx_count_16x16p); - vp9_zero(cm->fc.tx_count_8x8p); - vp9_zero(cm->fc.mbskip_count); + vp9_zero(cm->counts.tx); + vp9_zero(cm->counts.mbskip); // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
- vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 * - MAX_MB_PLANE * mi_cols_aligned_to_sb(cm)); - vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(cm)); + vpx_memset(cm->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); + vpx_memset(cm->above_seg_context, 0, + sizeof(PARTITION_CONTEXT) * aligned_mi_cols); } static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; - cpi->mb.optimize = 0; - cpi->common.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; - cpi->common.txfm_mode = ONLY_4X4; + // printf("Switching to lossless\n"); + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; + cpi->mb.optimize = 0; + cpi->mb.e_mbd.lf.filter_level = 0; + cpi->zbin_mode_boost_enabled = 0; + cpi->common.tx_mode = ONLY_4X4; } else { - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; + // printf("Not lossless\n"); + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; } } +static void switch_tx_mode(VP9_COMP *cpi) { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cpi->common.tx_mode >= ALLOW_32X32) + cpi->common.tx_mode = ALLOW_32X32; +} static void encode_frame_internal(VP9_COMP *cpi) { int mi_row; - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; int totalrate; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", @@ -1514,26 +2018,25 @@ static void encode_frame_internal(VP9_COMP *cpi) { // Reset frame count of inter 0,0 motion vector usage. cpi->inter_zz_count = 0; - vp9_zero(cm->fc.switchable_interp_count); - vp9_zero(cpi->best_switchable_interp_count); + vp9_zero(cm->counts.switchable_interp); + vp9_zero(cpi->txfm_stepdown_count); xd->mode_info_context = cm->mi; xd->prev_mode_info_context = cm->prev_mi; vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts); - vp9_zero(cm->fc.eob_branch_counts); + vp9_zero(cm->counts.eob_branch); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; + cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 + && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); vp9_frame_init_quantizer(cpi); vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q); vp9_initialize_me_consts(cpi, cm->base_qindex); + switch_tx_mode(cpi); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { // Initialize encode frame context. @@ -1546,36 +2049,38 @@ static void encode_frame_internal(VP9_COMP *cpi) { // re-initencode frame context. 
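/*
 * [Editor's illustrative sketch -- not part of the commit above.]
 * switch_lossless_mode() in the hunk above selects lossless operation by
 * swapping the 4x4/8x4 forward- and inverse-transform function pointers to
 * their Walsh-Hadamard variants, disabling trellis optimization and loop
 * filtering, and pinning the transform mode to ONLY_4X4.  The fragment below
 * isolates that pointer-swap idea in a standalone form; the struct and the
 * *_stub transforms are placeholders rather than libvpx symbols, and the
 * field handling is simplified relative to the real function.
 */
#include <stdint.h>
#include <string.h>

typedef void (*fwd_txfm_fn)(const int16_t *input, int16_t *output, int pitch);

struct coder_hooks {
  fwd_txfm_fn fwd_txm4x4;  /* forward transform used for 4x4 blocks */
  int optimize;            /* trellis coefficient optimization on/off */
  int only_4x4;            /* mirrors tx_mode == ONLY_4X4 */
};

static void fdct4x4_stub(const int16_t *in, int16_t *out, int pitch) {
  (void)pitch;
  memcpy(out, in, 16 * sizeof(*out));  /* placeholder body */
}

static void walsh4x4_stub(const int16_t *in, int16_t *out, int pitch) {
  (void)pitch;
  memcpy(out, in, 16 * sizeof(*out));  /* placeholder body */
}

static void set_lossless_mode(struct coder_hooks *h, int lossless) {
  /* Lossless coding needs a perfectly reversible transform, so the DCT is
   * replaced by the Walsh-Hadamard transform and only 4x4 blocks are used;
   * trellis optimization is skipped because no coefficient may be altered. */
  h->fwd_txm4x4 = lossless ? walsh4x4_stub : fdct4x4_stub;
  h->optimize = lossless ? 0 : 1;
  h->only_4x4 = lossless;
}

int main(void) {
  struct coder_hooks h;
  set_lossless_mode(&h, 1);
  return h.fwd_txm4x4 == walsh4x4_stub ? 0 : 1;
}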
init_encode_frame_mb_context(cpi); - vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff)); - vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff)); - vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes)); + vp9_zero(cpi->rd_comp_pred_diff); + vp9_zero(cpi->rd_filter_diff); + vp9_zero(cpi->rd_tx_select_diff); + vp9_zero(cpi->rd_tx_select_threshes); set_prev_mi(cm); { - struct vpx_usec_timer emr_timer; + struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); { // Take tiles into account and give start/end MB int tile_col, tile_row; TOKENEXTRA *tp = cpi->tok; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; - for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + for (tile_row = 0; tile_row < tile_rows; tile_row++) { vp9_get_tile_row_offsets(cm, tile_row); - for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { TOKENEXTRA *tp_old = tp; // For each row of SBs in the frame vp9_get_tile_col_offsets(cm, tile_col); for (mi_row = cm->cur_tile_mi_row_start; - mi_row < cm->cur_tile_mi_row_end; - mi_row += 8) + mi_row < cm->cur_tile_mi_row_end; mi_row += 8) encode_sb_row(cpi, mi_row, &tp, &totalrate); + cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); - assert(tp - cpi->tok <= - get_token_alloc(cm->mb_rows, cm->mb_cols)); + assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); } } } @@ -1584,6 +2089,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); } + if (cpi->sf.skip_encode_sb) { + int j; + unsigned int intra_count = 0, inter_count = 0; + for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { + intra_count += cpi->intra_inter_count[j][0]; + inter_count += cpi->intra_inter_count[j][1]; + } + cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count); + cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME); + cpi->sf.skip_encode_frame &= cm->show_frame; + } else { + cpi->sf.skip_encode_frame = 0; + } + // 256 rate units to the bit, // projected_frame_size in units of BYTES cpi->projected_frame_size = totalrate >> 8; @@ -1599,12 +2118,11 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { MACROBLOCKD *xd = &cpi->mb.e_mbd; int ref_flags = cpi->ref_frame_flags; - if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { + if (vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { return 0; } else { - return (!!(ref_flags & VP9_GOLD_FLAG) + - !!(ref_flags & VP9_LAST_FLAG) + - !!(ref_flags & VP9_ALT_FLAG)) >= 2; + return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) + + !!(ref_flags & VP9_ALT_FLAG)) >= 2; } } @@ -1631,35 +2149,32 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, } } -static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, - int mis, TX_SIZE txfm_max, - int bw, int bh, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MB_MODE_INFO *const mbmi = &mi->mbmi; +static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis, + TX_SIZE txfm_max, int bw, int bh, int mi_row, + int mi_col, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MB_MODE_INFO * const mbmi = &mi->mbmi; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (mbmi->txfm_size > txfm_max) { - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = mbmi->segment_id; + MACROBLOCK * const 
x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; const int ymbs = MIN(bh, cm->mi_rows - mi_row); const int xmbs = MIN(bw, cm->mi_cols - mi_col); xd->mode_info_context = mi; - assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) || + assert(vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) || get_skip_flag(mi, mis, ymbs, xmbs)); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } } static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, - TX_SIZE txfm_max, - int mi_row, int mi_col, + TX_SIZE txfm_max, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; const int mis = cm->mode_info_stride; int bwl, bhl; const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); @@ -1671,18 +2186,18 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, bhl = mi_height_log2(mi->mbmi.sb_type); if (bwl == bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, - mi_row, mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row, + mi_col, bsize); } else if (bwl == bsl && bhl < bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, - mi_row, mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col, + bsize); reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs, mi_row + bs, mi_col, bsize); } else if (bwl < bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, - mi_row, mi_col, bsize); - reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, - mi_row, mi_col + bs, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col, + bsize); + reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row, + mi_col + bs, bsize); } else { BLOCK_SIZE_TYPE subsize; int n; @@ -1700,43 +2215,82 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, for (n = 0; n < 4; n++) { const int y_idx = n >> 1, x_idx = n & 0x01; - reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, - txfm_max, mi_row + y_idx * bs, - mi_col + x_idx * bs, subsize); + reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max, + mi_row + y_idx * bs, mi_col + x_idx * bs, + subsize); } } } static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; int mi_row, mi_col; const int mis = cm->mode_info_stride; MODE_INFO *mi, *mi_ptr = cm->mi; - for (mi_row = 0; mi_row < cm->mi_rows; - mi_row += 8, mi_ptr += 8 * mis) { + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = 0; mi_col < cm->mi_cols; - mi_col += 8, mi += 8) { - reset_skip_txfm_size_sb(cpi, mi, txfm_max, - mi_row, mi_col, BLOCK_SIZE_SB64X64); + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) { + reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col, + BLOCK_SIZE_SB64X64); + } + } +} + +static int get_frame_type(VP9_COMP *cpi) { + int frame_type; + if (cpi->common.frame_type == KEY_FRAME) + frame_type = 0; + else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) + frame_type = 3; + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) + frame_type = 1; + else + frame_type = 2; + return frame_type; +} + +static void select_tx_mode(VP9_COMP *cpi) { + if (cpi->oxcf.lossless) { + cpi->common.tx_mode = ONLY_4X4; + } else if (cpi->common.current_video_frame == 0) { + 
cpi->common.tx_mode = TX_MODE_SELECT; + } else { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { + cpi->common.tx_mode = ALLOW_32X32; + } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + int frame_type = get_frame_type(cpi); + cpi->common.tx_mode = + cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] + > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? + ALLOW_32X32 : TX_MODE_SELECT; + } else { + unsigned int total = 0; + int i; + for (i = 0; i < TX_SIZE_MAX_SB; ++i) + total += cpi->txfm_stepdown_count[i]; + if (total) { + double fraction = (double)cpi->txfm_stepdown_count[0] / total; + cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT; + // printf("fraction = %f\n", fraction); + } // else keep unchanged } } } void vp9_encode_frame(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a - // differnt sign bias and that buffer is then the fixed ref. However, this + // different sign bias and that buffer is then the fixed ref. However, this // requires further work in the rd loop. For now the only supported encoder - // side behaviour is where the ALT ref buffer has oppositie sign bias to + // side behaviour is where the ALT ref buffer has opposite sign bias to // the other two. - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[GOLDEN_FRAME]) + || (cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[LAST_FRAME])) { cm->allow_comp_inter_inter = 0; } else { cm->allow_comp_inter_inter = 1; @@ -1746,9 +2300,8 @@ void vp9_encode_frame(VP9_COMP *cpi) { } if (cpi->sf.RD) { - int i, frame_type, pred_type; - TXFM_MODE txfm_type; - + int i, pred_type; + INTERPOLATIONFILTERTYPE filter_type; /* * This code does a single RD pass over the whole frame assuming * either compound, single or hybrid prediction as per whatever has @@ -1758,86 +2311,78 @@ void vp9_encode_frame(VP9_COMP *cpi) { * that for subsequent frames. * It does the same analysis for transform size selection also. 
*/ - if (cpi->common.frame_type == KEY_FRAME) - frame_type = 0; - else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) - frame_type = 3; - else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - frame_type = 1; - else - frame_type = 2; + int frame_type = get_frame_type(cpi); /* prediction (compound, single or hybrid) mode selection */ if (frame_type == 3 || !cm->allow_comp_inter_inter) pred_type = SINGLE_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][1] > - cpi->rd_prediction_type_threshes[frame_type][0] && - cpi->rd_prediction_type_threshes[frame_type][1] > - cpi->rd_prediction_type_threshes[frame_type][2] && - check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) + else if (cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][0] + && cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][2] + && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) pred_type = COMP_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][0] > - cpi->rd_prediction_type_threshes[frame_type][2]) + else if (cpi->rd_prediction_type_threshes[frame_type][0] + > cpi->rd_prediction_type_threshes[frame_type][2]) pred_type = SINGLE_PREDICTION_ONLY; else pred_type = HYBRID_PREDICTION; + /* filter type selection */ + // FIXME(rbultje) for some odd reason, we often select smooth_filter + // as default filter for ARF overlay frames. This is a REALLY BAD + // IDEA so we explicitly disable it here. + if (frame_type != 3 && + cpi->rd_filter_threshes[frame_type][1] > + cpi->rd_filter_threshes[frame_type][0] && + cpi->rd_filter_threshes[frame_type][1] > + cpi->rd_filter_threshes[frame_type][2] && + cpi->rd_filter_threshes[frame_type][1] > + cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { + filter_type = vp9_switchable_interp[1]; + } else if (cpi->rd_filter_threshes[frame_type][2] > + cpi->rd_filter_threshes[frame_type][0] && + cpi->rd_filter_threshes[frame_type][2] > + cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { + filter_type = vp9_switchable_interp[2]; + } else if (cpi->rd_filter_threshes[frame_type][0] > + cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { + filter_type = vp9_switchable_interp[0]; + } else { + filter_type = SWITCHABLE; + } + /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */ cpi->mb.e_mbd.lossless = 0; if (cpi->oxcf.lossless) { - txfm_type = ONLY_4X4; cpi->mb.e_mbd.lossless = 1; - } else -#if 0 - /* FIXME (rbultje): this code is disabled until we support cost updates - * while a frame is being encoded; the problem is that each time we - * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities - * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging - * further behind and not being chosen for subsequent frames either. This - * is essentially a local minimum problem that we can probably fix by - * estimating real costs more closely within a frame, perhaps by re- - * calculating costs on-the-fly as frame encoding progresses. 
*/ - if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = TX_MODE_SELECT; - } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8] - && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] - ) { - txfm_type = ONLY_4X4; - } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = ALLOW_16X16; - } else - txfm_type = ALLOW_8X8; -#else - txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] > - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? - ALLOW_32X32 : TX_MODE_SELECT; -#endif - cpi->common.txfm_mode = txfm_type; + } + + select_tx_mode(cpi); cpi->common.comp_pred_mode = pred_type; + cpi->common.mcomp_filter_type = filter_type; encode_frame_internal(cpi); for (i = 0; i < NB_PREDICTION_TYPES; ++i) { - const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs); + const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs); cpi->rd_prediction_type_threshes[frame_type][i] += diff; cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; + cpi->rd_filter_threshes[frame_type][i] = + (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; + } + for (i = 0; i < NB_TXFM_MODES; ++i) { int64_t pd = cpi->rd_tx_select_diff[i]; int diff; if (i == TX_MODE_SELECT) pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX_SB - 1), 0); - diff = (int)(pd / cpi->common.MBs); + diff = (int) (pd / cpi->common.MBs); cpi->rd_tx_select_threshes[frame_type][i] += diff; cpi->rd_tx_select_threshes[frame_type][i] /= 2; } @@ -1860,64 +2405,47 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } - if (cpi->common.txfm_mode == TX_MODE_SELECT) { + if (cpi->common.tx_mode == TX_MODE_SELECT) { int count4x4 = 0; int count8x8_lp = 0, count8x8_8x8p = 0; int count16x16_16x16p = 0, count16x16_lp = 0; int count32x32 = 0; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count4x4 += cm->fc.tx_count_32x32p[i][TX_4X4]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count4x4 += cm->fc.tx_count_16x16p[i][TX_4X4]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count4x4 += cm->fc.tx_count_8x8p[i][TX_4X4]; - - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count8x8_lp += cm->fc.tx_count_32x32p[i][TX_8X8]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count8x8_lp += cm->fc.tx_count_16x16p[i][TX_8X8]; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + count4x4 += cm->counts.tx.p32x32[i][TX_4X4]; + count4x4 += cm->counts.tx.p16x16[i][TX_4X4]; + count4x4 += cm->counts.tx.p8x8[i][TX_4X4]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count8x8_8x8p += cm->fc.tx_count_8x8p[i][TX_8X8]; + count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8]; + count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8]; + count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count16x16_16x16p += cm->fc.tx_count_16x16p[i][TX_16X16]; - - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count16x16_lp += cm->fc.tx_count_32x32p[i][TX_16X16]; - - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32]; + count16x16_16x16p += 
cm->counts.tx.p16x16[i][TX_16X16]; + count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16]; + count32x32 += cm->counts.tx.p32x32[i][TX_32X32]; + } - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && - count32x32 == 0) { - cpi->common.txfm_mode = ALLOW_8X8; + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 + && count32x32 == 0) { + cpi->common.tx_mode = ALLOW_8X8; reset_skip_txfm_size(cpi, TX_8X8); - } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && - count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { - cpi->common.txfm_mode = ONLY_4X4; + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 + && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { + cpi->common.tx_mode = ONLY_4X4; reset_skip_txfm_size(cpi, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { - cpi->common.txfm_mode = ALLOW_32X32; + cpi->common.tx_mode = ALLOW_32X32; } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { - cpi->common.txfm_mode = ALLOW_16X16; + cpi->common.tx_mode = ALLOW_16X16; reset_skip_txfm_size(cpi, TX_16X16); } } - - // Update interpolation filter strategy for next frame. - if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter)) - vp9_select_interp_filter_type(cpi); } else { encode_frame_internal(cpi); } } -void vp9_build_block_offsets(MACROBLOCK *x) { -} - static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; @@ -1931,11 +2459,13 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { ++cpi->y_mode_count[MIN(bsl, 3)][m]; } else { int idx, idy; - int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type); - int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type); - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[ + xd->mode_info_context->mbmi.sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[ + xd->mode_info_context->mbmi.sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode; ++cpi->y_mode_count[0][m]; } } @@ -1957,26 +2487,28 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) { b = 4 * act + cpi->activity_avg; if (act > cpi->activity_avg) - x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1; + x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1; else - x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b); + x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); #endif } -static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - int n; +static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, + int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = xd->mode_info_context; MB_MODE_INFO *mbmi = &mi->mbmi; unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; - const int bwl = mi_width_log2(bsize); - const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize); + const int mi_width = 
num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; x->rd_search = 0; + x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && + xd->q_index < QIDX_SKIP_THRESH); + if (x->skip_encode) + return; if (cm->frame_type == KEY_FRAME) { if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { @@ -2015,10 +2547,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (mbmi->ref_frame[0] == INTRA_FRAME) { - vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? - BLOCK_SIZE_SB8X8 : bsize); - vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? - BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_y( + cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_uv( + cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); if (output_enabled) sum_intra_stats(cpi, x); } else { @@ -2032,58 +2564,51 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); - setup_pre_planes(xd, ref_fb, second_ref_fb, - mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv); + setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, + &xd->scale_factor[0]); + setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, + &xd->scale_factor[1]); + - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, - bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 - : bsize); + vp9_build_inter_predictors_sb( + xd, mi_row, mi_col, + bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize); } if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { - vp9_tokenize_sb(cpi, xd, t, !output_enabled, + vp9_tokenize_sb(cpi, t, !output_enabled, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } else if (!x->skip) { vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - vp9_tokenize_sb(cpi, xd, t, !output_enabled, + vp9_tokenize_sb(cpi, t, !output_enabled, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } else { - // FIXME(rbultje): not tile-aware (mi - 1) - int mb_skip_context = - (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff; + int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0; + mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff; mbmi->mb_skip_coeff = 1; if (output_enabled) - cm->fc.mbskip_count[mb_skip_context][1]++; - vp9_reset_sb_tokens_context(xd, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + cm->counts.mbskip[mb_skip_context][1]++; + vp9_reset_sb_tokens_context( + xd, (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 : bsize); } // copy skip flag on all mb_mode_info contexts in this SB // if this was a skip at this txfm size - for (n = 1; n < bw * bh; n++) { - const int x_idx = n & (bw - 1), y_idx = n >> bwl; - if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows) - mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; - } + vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, mi->mbmi.mb_skip_coeff); if (output_enabled) { - if (cm->txfm_mode == TX_MODE_SELECT && - mbmi->sb_type >= BLOCK_SIZE_SB8X8 && - !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff || - vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { - const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); - if (bsize >= BLOCK_SIZE_SB32X32) { - cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++; - } else if (bsize >= BLOCK_SIZE_MB16X16) { - cm->fc.tx_count_16x16p[context][mbmi->txfm_size]++; - } else { - cm->fc.tx_count_8x8p[context][mbmi->txfm_size]++; - } + if (cm->tx_mode == TX_MODE_SELECT && + mbmi->sb_type >= BLOCK_SIZE_SB8X8 && + !(mbmi->ref_frame[0] != INTRA_FRAME && + (mbmi->mb_skip_coeff || + vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) { + const uint8_t context = vp9_get_pred_context_tx_size(xd); + update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx); } else { int x, y; - TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode; - // The new intra coding scheme requires no change of transform size + TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode; + // The new intra coding scheme requires no change of transform size if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32) sz = TX_16X16; @@ -2097,8 +2622,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, sz = TX_4X4; } - for (y = 0; y < bh; y++) { - for (x = 0; x < bw; x++) { + for (y = 0; y < mi_height; y++) { + for (x = 0; x < mi_width; x++) { if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) { mi[mis * y + x].mbmi.txfm_size = sz; } diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h index d37bdca..3991969 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libvpx/vp9/encoder/vp9_encodeframe.h @@ -15,8 +15,6 @@ struct macroblock; struct yv12_buffer_config; -void vp9_build_block_offsets(struct macroblock *x); - void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mb_row, int mb_col); diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c index f29dba0..d49e532 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.c +++ b/libvpx/vp9/encoder/vp9_encodeintra.c @@ -18,15 +18,11 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; (void) cpi; + x->skip_encode = 0; mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; - if (use_16x16_pred) { - mbmi->txfm_size = mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? TX_16X16 : TX_8X8; - vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); - } else { - mbmi->txfm_size = TX_4X4; - vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); - } - + mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? 
+ TX_16X16 : TX_8X8) : TX_4X4; + vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); return vp9_get_mb_ss(x->plane[0].src_diff); } diff --git a/libvpx/vp9/encoder/vp9_encodeintra.h b/libvpx/vp9/encoder/vp9_encodeintra.h index 14d144b..16ac59e 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.h +++ b/libvpx/vp9/encoder/vp9_encodeintra.h @@ -14,6 +14,8 @@ #include "vp9/encoder/vp9_onyx_int.h" int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred); +void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg); void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb, BLOCK_SIZE_TYPE bs); void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb, diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 4f45496..66e35a9 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -22,10 +22,10 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride) { +void vp9_subtract_block_c(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { @@ -78,7 +78,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) -#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) typedef struct vp9_token_state vp9_token_state; struct vp9_token_state { @@ -110,14 +109,13 @@ static const int plane_rd_mult[4] = { // This function is a place holder for now but may ultimately need // to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int *scan, - const int *nb, +static int trellis_get_coeff_context(const int16_t *scan, + const int16_t *nb, int idx, int token, - uint8_t *token_cache, - int pad, int l) { + uint8_t *token_cache) { int bak = token_cache[scan[idx]], pt; token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l); + pt = get_coef_context(nb, token_cache, idx + 1); token_cache[scan[idx]] = bak; return pt; } @@ -142,8 +140,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, int best, band, pt; PLANE_TYPE type = xd->plane[plane].plane_type; int err_mult = plane_rd_mult[type]; - int default_eob, pad; - int const *scan, *nb; + int default_eob; + const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; const int ib = txfrm_block_to_raster_block(xd, bsize, plane, @@ -156,27 +154,21 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); switch (tx_size) { default: - case TX_4X4: { - const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT; + case TX_4X4: default_eob = 16; - scan = get_scan_4x4(tx_type); + scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib)); band_translate = vp9_coefband_trans_4x4; break; - } - case TX_8X8: { - const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT; - scan = get_scan_8x8(tx_type); + case TX_8X8: + scan = get_scan_8x8(get_tx_type_8x8(type, xd)); default_eob = 64; band_translate = vp9_coefband_trans_8x8plus; break; - } - case TX_16X16: { - const TX_TYPE tx_type = plane == 0 ? 
get_tx_type_16x16(xd, ib) : DCT_DCT; - scan = get_scan_16x16(tx_type); + case TX_16X16: + scan = get_scan_16x16(get_tx_type_16x16(type, xd)); default_eob = 256; band_translate = vp9_coefband_trans_8x8plus; break; - } case TX_32X32: scan = vp9_default_scan_32x32; default_eob = 1024; @@ -190,7 +182,6 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) rdmult = (rdmult * 9) >> 4; rddiv = mb->rddiv; - memset(best_index, 0, sizeof(best_index)); /* Initialize the sentinel node of the trellis. */ tokens[eob][0].rate = 0; tokens[eob][0].error = 0; @@ -202,7 +193,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ qcoeff_ptr[scan[i]]].token]; - nb = vp9_get_coef_neighbors_handle(scan, &pad); + nb = vp9_get_coef_neighbors_handle(scan); for (i = eob; i-- > i0;) { int base_bits, d2, dx; @@ -221,14 +212,13 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, /* Consider both possible successor states. */ if (next < default_eob) { band = get_coef_band(band_translate, i + 1); - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, - pad, default_eob); + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += - mb->token_costs_noskip[tx_size][type][ref][band][pt] - [tokens[next][0].token]; + mb->token_costs[tx_size][type][ref][0][band][pt] + [tokens[next][0].token]; rate1 += - mb->token_costs_noskip[tx_size][type][ref][band][pt] - [tokens[next][1].token]; + mb->token_costs[tx_size][type][ref][0][band][pt] + [tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ @@ -274,24 +264,14 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, if (next < default_eob) { band = get_coef_band(band_translate, i + 1); if (t0 != DCT_EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, - pad, default_eob); - if (!x) - rate0 += mb->token_costs[tx_size][type][ref][band][pt][ - tokens[next][0].token]; - else - rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][ - tokens[next][0].token]; + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); + rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt] + [tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache, - pad, default_eob); - if (!x) - rate1 += mb->token_costs[tx_size][type][ref][band][pt][ - tokens[next][1].token]; - else - rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][ - tokens[next][1].token]; + pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); + rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt] + [tokens[next][1].token]; } } @@ -323,14 +303,15 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, /* Update the cost of each path if we're past the EOB token. */ if (t0 != DCT_EOB_TOKEN) { tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][band][0][t0]; + mb->token_costs[tx_size][type][ref][1][band][0][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != DCT_EOB_TOKEN) { tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][band][0][t1]; + mb->token_costs[tx_size][type][ref][1][band][0][t1]; tokens[next][1].token = ZERO_TOKEN; } + best_index[i][0] = best_index[i][1] = 0; /* Don't update next, because we didn't add a new node. 
*/ } } @@ -344,8 +325,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0]; - rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0]; + rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; @@ -369,12 +350,6 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, *a = *l = (final_eob > 0); } -struct optimize_block_args { - VP9_COMMON *cm; - MACROBLOCK *x; - struct optimize_ctx *ctx; -}; - void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb, struct optimize_ctx *ctx) { @@ -390,7 +365,7 @@ void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { - const struct optimize_block_args* const args = arg; + const struct encode_b_args* const args = arg; vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x, args->ctx); } @@ -427,7 +402,7 @@ void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct optimize_ctx ctx; - struct optimize_block_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_optimize_init(&x->e_mbd, bsize, &ctx); foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg); } @@ -435,64 +410,83 @@ void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct optimize_ctx ctx; - struct optimize_block_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_optimize_init(&x->e_mbd, bsize, &ctx); foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg); } -struct encode_b_args { - VP9_COMMON *cm; - MACROBLOCK *x; - struct optimize_ctx *ctx; -}; - -static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; - const int bw = plane_block_width(bsize, &xd->plane[plane]); - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16); - int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane, - raster_block, - x->plane[plane].src_diff); - TX_TYPE tx_type = DCT_DCT; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + const int16_t *scan, *iscan; + uint16_t *eob = &pd->eobs[block]; + const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl; + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; + int xoff, yoff; + int16_t *src_diff; - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: + scan = vp9_default_scan_32x32; + iscan = vp9_default_iscan_32x32; + 
block >>= 6; + xoff = 32 * (block & twmask); + yoff = 32 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->rd_search) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2); + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else - vp9_short_fdct32x32(src_diff, coeff, bw * 2); + vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - if (tx_type != DCT_DCT) - vp9_short_fht16x16(src_diff, coeff, bw, tx_type); - else - x->fwd_txm16x16(src_diff, coeff, bw * 2); + scan = vp9_default_scan_16x16; + iscan = vp9_default_iscan_16x16; + block >>= 4; + xoff = 16 * (block & twmask); + yoff = 16 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; + x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_8X8: - tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; - if (tx_type != DCT_DCT) - vp9_short_fht8x8(src_diff, coeff, bw, tx_type); - else - x->fwd_txm8x8(src_diff, coeff, bw * 2); + scan = vp9_default_scan_8x8; + iscan = vp9_default_iscan_8x8; + block >>= 2; + xoff = 8 * (block & twmask); + yoff = 8 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; + x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; - if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, bw, tx_type); - else - x->fwd_txm4x4(src_diff, coeff, bw * 2); + scan = vp9_default_scan_4x4; + iscan = vp9_default_iscan_4x4; + xoff = 4 * (block & twmask); + yoff = 4 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; + x->fwd_txm4x4(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; default: assert(0); } - - vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type); } static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -507,41 +501,32 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, pd->dst.buf, pd->dst.stride); - TX_TYPE tx_type = DCT_DCT; - xform_quant(plane, block, bsize, ss_txfrm_size, arg); if (x->optimize) vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx); + if (x->skip_encode) + return; + if (pd->eobs[block] == 0) + return; + switch (ss_txfrm_size / 2) { case TX_32X32: vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); - else - vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); break; case TX_8X8: - tx_type = plane == 0 ? 
get_tx_type_8x8(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); - else - vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - // this is like vp9_short_idct4x4 but has a special case around eob<=1 - // which is significant (not just an optimization) for the lossless - // case. - inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, - dst, pd->dst.stride); - else - vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, + dst, pd->dst.stride); break; } } @@ -597,92 +582,157 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { foreach_transformed_block(xd, bsize, encode_block, &arg); } -static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - const int bw = plane_block_width(bsize, pd); - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - - uint8_t *const src = raster_block_offset_uint8(xd, bsize, plane, raster_block, - p->src.buf, p->src.stride); - uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, - pd->dst.buf, pd->dst.stride); - int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane, - raster_block, - p->src_diff); - - const int txfm_b_size = 4 << tx_size; - int ib = raster_block; - int tx_ib = ib >> tx_size; - int plane_b_size; - + int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + const int16_t *scan, *iscan; TX_TYPE tx_type; - int mode, b_mode; + MB_PREDICTION_MODE mode; + const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl; + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; + int xoff, yoff; + uint8_t *src, *dst; + int16_t *src_diff; + uint16_t *eob = &pd->eobs[block]; if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); } - mode = plane == 0? 
mbmi->mode: mbmi->uv_mode; - if (plane == 0 && - mbmi->sb_type < BLOCK_SIZE_SB8X8 && - mbmi->ref_frame[0] == INTRA_FRAME) - b_mode = xd->mode_info_context->bmi[ib].as_mode.first; - else - b_mode = mode; - - assert(b_mode >= DC_PRED && b_mode <= TM_PRED); - - plane_b_size = b_width_log2(bsize) - pd->subsampling_x; - vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, - dst, pd->dst.stride); - vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw, - src, p->src.stride, dst, pd->dst.stride); - - xform_quant(plane, block, bsize, ss_txfrm_size, arg); - - // if (x->optimize) // vp9_optimize_b(plane, block, bsize, ss_txfrm_size, // args->cm, x, args->ctx); - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: + scan = vp9_default_scan_32x32; + iscan = vp9_default_iscan_32x32; + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + block >>= 6; + xoff = 32 * (block & twmask); + yoff = 32 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(32, 32, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (x->rd_search) + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); + else + vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); + tx_type = get_tx_type_16x16(pd->plane_type, xd); + scan = get_scan_16x16(tx_type); + iscan = get_iscan_16x16(tx_type); + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + block >>= 4; + xoff = 16 * (block & twmask); + yoff = 16 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(16, 16, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); else - vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); + x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); + else + vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); + } break; case TX_8X8: - tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); + tx_type = get_tx_type_8x8(pd->plane_type, xd); + scan = get_scan_8x8(tx_type); + iscan = get_iscan_8x8(tx_type); + mode = plane == 0 ? 
mbmi->mode : mbmi->uv_mode; + block >>= 2; + xoff = 8 * (block & twmask); + yoff = 8 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(8, 8, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type); else - vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); + x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); + else + vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); + } break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - // this is like vp9_short_idct4x4 but has a special case around eob<=1 - // which is significant (not just an optimization) for the lossless - // case. - inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, - dst, pd->dst.stride); + tx_type = get_tx_type_4x4(pd->plane_type, xd, block); + scan = get_scan_4x4(tx_type); + iscan = get_iscan_4x4(tx_type); + if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0) { + mode = xd->mode_info_context->bmi[block].as_mode; + } else { + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + } + xoff = 4 * (block & twmask); + yoff = 4 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(4, 4, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); else - vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + x->fwd_txm4x4(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
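/*
 * Editor's sketch (not part of the libvpx diff): the per-block residual that
 * feeds the forward transforms above.  A minimal stand-alone version of
 * "subtract the predictor from the source", assuming 8-bit samples and
 * arbitrary row strides; the function name is hypothetical, the parameter
 * order mirrors the vp9_subtract_block() calls quoted in this hunk.
 */
#include <stdint.h>

static void subtract_block_sketch(int rows, int cols,
                                  int16_t *diff, int diff_stride,
                                  const uint8_t *src, int src_stride,
                                  const uint8_t *pred, int pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c)
      diff[c] = (int16_t)src[c] - (int16_t)pred[c];  /* residual sample */
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}

/*
 * For a 4x4 luma block at (xoff, yoff) with the strides used in this file
 * (src_diff rows are bw * 4 samples wide), the call would look like:
 *   subtract_block_sketch(4, 4, src_diff, bw * 4,
 *                         src, p->src.stride, dst, pd->dst.stride);
 * The inverse transform below is then only applied when skip_encode is off
 * and the quantizer left a nonzero coefficient (*eob != 0).
 */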
+ inverse_transform_b_4x4_add(xd, *eob, dqcoeff, + dst, pd->dst.stride); + else + vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + } break; + default: + assert(0); } } diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index 5796903..defaa48 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -27,6 +27,12 @@ struct optimize_ctx { ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; }; +struct encode_b_args { + VP9_COMMON *cm; + MACROBLOCK *x; + struct optimize_ctx *ctx; +}; + void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, struct optimize_ctx *ctx); void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -39,13 +45,11 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg); void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride); void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index a582d18..2f5e16c 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -128,111 +128,93 @@ static void build_nmv_component_cost_table(int *mvcost, } } -static int update_nmv_savings(const unsigned int ct[2], - const vp9_prob cur_p, - const vp9_prob new_p, - const vp9_prob upd_p) { - -#ifdef LOW_PRECISION_MV_UPDATE - vp9_prob mod_p = new_p | 1; -#else - vp9_prob mod_p = new_p; -#endif - const int cur_b = cost_branch256(ct, cur_p); - const int mod_b = cost_branch256(ct, mod_p); - const int cost = 7 * 256 + -#ifndef LOW_PRECISION_MV_UPDATE - 256 + -#endif - (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); - if (cur_b - mod_b - cost > 0) { - return cur_b - mod_b - cost; - } else { - return 0 - vp9_cost_zero(upd_p); - } -} - -static int update_nmv( - vp9_writer *const bc, - const unsigned int ct[2], - vp9_prob *const cur_p, - const vp9_prob new_p, - const vp9_prob upd_p) { - -#ifdef LOW_PRECISION_MV_UPDATE +static int update_mv(vp9_writer *w, const unsigned int ct[2], + vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) { vp9_prob mod_p = new_p | 1; -#else - vp9_prob mod_p = new_p; -#endif - const int cur_b = cost_branch256(ct, *cur_p); const int mod_b = cost_branch256(ct, mod_p); - const int cost = 7 * 256 + -#ifndef LOW_PRECISION_MV_UPDATE - 256 + -#endif - (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); - + const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); if (cur_b - mod_b > cost) { *cur_p = mod_p; - vp9_write(bc, 1, upd_p); -#ifdef LOW_PRECISION_MV_UPDATE - vp9_write_literal(bc, mod_p >> 1, 7); -#else - vp9_write_literal(bc, mod_p, 8); -#endif + vp9_write(w, 1, upd_p); + vp9_write_literal(w, mod_p >> 1, 7); return 1; } else { - vp9_write(bc, 0, upd_p); + vp9_write(w, 0, upd_p); return 0; } } -void print_nmvcounts(nmv_context_counts tnmvcounts) { +static void counts_to_nmv_context( + nmv_context_counts *nmv_count, + 
nmv_context *prob, + int usehp, + unsigned int (*branch_ct_joint)[2], + unsigned int (*branch_ct_sign)[2], + unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], + unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], + unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], + unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], + unsigned int (*branch_ct_fp)[4 - 1][2], + unsigned int (*branch_ct_class0_hp)[2], + unsigned int (*branch_ct_hp)[2]) { int i, j, k; - printf("\nCounts =\n { "); - for (j = 0; j < MV_JOINTS; ++j) - printf("%d, ", tnmvcounts.joints[j]); - printf("},\n"); + vp9_counts_process(nmv_count, usehp); + vp9_tree_probs_from_distribution(vp9_mv_joint_tree, + prob->joints, + branch_ct_joint, + nmv_count->joints, 0); for (i = 0; i < 2; ++i) { - printf(" {\n"); - printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0], - tnmvcounts.comps[i].sign[1]); - printf(" { "); - for (j = 0; j < MV_CLASSES; ++j) - printf("%d, ", tnmvcounts.comps[i].classes[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d, ", tnmvcounts.comps[i].class0[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0], - tnmvcounts.comps[i].bits[j][1]); - printf("},\n"); + const uint32_t s0 = nmv_count->comps[i].sign[0]; + const uint32_t s1 = nmv_count->comps[i].sign[1]; + + prob->comps[i].sign = get_binary_prob(s0, s1); + branch_ct_sign[i][0] = s0; + branch_ct_sign[i][1] = s1; + vp9_tree_probs_from_distribution(vp9_mv_class_tree, + prob->comps[i].classes, + branch_ct_classes[i], + nmv_count->comps[i].classes, 0); + vp9_tree_probs_from_distribution(vp9_mv_class0_tree, + prob->comps[i].class0, + branch_ct_class0[i], + nmv_count->comps[i].class0, 0); + for (j = 0; j < MV_OFFSET_BITS; ++j) { + const uint32_t b0 = nmv_count->comps[i].bits[j][0]; + const uint32_t b1 = nmv_count->comps[i].bits[j][1]; - printf(" {"); - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]); - printf("}, "); + prob->comps[i].bits[j] = get_binary_prob(b0, b1); + branch_ct_bits[i][j][0] = b0; + branch_ct_bits[i][j][1] = b1; + } + } + for (i = 0; i < 2; ++i) { + for (k = 0; k < CLASS0_SIZE; ++k) { + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, + prob->comps[i].class0_fp[k], + branch_ct_class0_fp[i][k], + nmv_count->comps[i].class0_fp[k], 0); + } + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, + prob->comps[i].fp, + branch_ct_fp[i], + nmv_count->comps[i].fp, 0); + } + if (usehp) { + for (i = 0; i < 2; ++i) { + const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0]; + const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1]; + const uint32_t hp0 = nmv_count->comps[i].hp[0]; + const uint32_t hp1 = nmv_count->comps[i].hp[1]; + + prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); + branch_ct_class0_hp[i][0] = c0_hp0; + branch_ct_class0_hp[i][1] = c0_hp1; + + prob->comps[i].hp = get_binary_prob(hp0, hp1); + branch_ct_hp[i][0] = hp0; + branch_ct_hp[i][1] = hp1; } - printf("},\n"); - - printf(" { "); - for (j = 0; j < 4; ++j) - printf("%d, ", tnmvcounts.comps[i].fp[j]); - printf("},\n"); - - printf(" %d/%d,\n", - tnmvcounts.comps[i].class0_hp[0], - tnmvcounts.comps[i].class0_hp[1]); - printf(" %d/%d,\n", - tnmvcounts.comps[i].hp[0], - tnmvcounts.comps[i].hp[1]); - printf(" },\n"); } } @@ -253,11 +235,11 @@ void print_nmvstats() { unsigned int branch_ct_class0_hp[2][2]; unsigned int branch_ct_hp[2][2]; int i, j, k; - 
vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); + counts_to_nmv_context(&tnmvcounts, &prob, 1, + branch_ct_joint, branch_ct_sign, branch_ct_classes, + branch_ct_class0, branch_ct_bits, + branch_ct_class0_fp, branch_ct_fp, + branch_ct_class0_hp, branch_ct_hp); printf("\nCounts =\n { "); for (j = 0; j < MV_JOINTS; ++j) @@ -394,154 +376,69 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_fp[2][4 - 1][2]; unsigned int branch_ct_class0_hp[2][2]; unsigned int branch_ct_hp[2][2]; -#ifdef MV_GROUP_UPDATE - int savings = 0; -#endif + nmv_context *mvc = &cpi->common.fc.nmvc; #ifdef NMV_STATS if (!cpi->dummy_packing) add_nmvcount(&tnmvcounts, &cpi->NMVcount); #endif - vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); - /* write updates if they help */ -#ifdef MV_GROUP_UPDATE - for (j = 0; j < MV_JOINTS - 1; ++j) { - savings += update_nmv_savings(branch_ct_joint[j], - cpi->common.fc.nmvc.joints[j], - prob.joints[j], - VP9_NMV_UPDATE_PROB); - } - for (i = 0; i < 2; ++i) { - savings += update_nmv_savings(branch_ct_sign[i], - cpi->common.fc.nmvc.comps[i].sign, - prob.comps[i].sign, - VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_CLASSES - 1; ++j) { - savings += update_nmv_savings(branch_ct_classes[i][j], - cpi->common.fc.nmvc.comps[i].classes[j], - prob.comps[i].classes[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - savings += update_nmv_savings(branch_ct_class0[i][j], - cpi->common.fc.nmvc.comps[i].class0[j], - prob.comps[i].class0[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - savings += update_nmv_savings(branch_ct_bits[i][j], - cpi->common.fc.nmvc.comps[i].bits[j], - prob.comps[i].bits[j], - VP9_NMV_UPDATE_PROB); - } - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - int k; - for (k = 0; k < 3; ++k) { - savings += update_nmv_savings(branch_ct_class0_fp[i][j][k], - cpi->common.fc.nmvc.comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], - VP9_NMV_UPDATE_PROB); - } - } - for (j = 0; j < 3; ++j) { - savings += update_nmv_savings(branch_ct_fp[i][j], - cpi->common.fc.nmvc.comps[i].fp[j], - prob.comps[i].fp[j], - VP9_NMV_UPDATE_PROB); - } - } - if (usehp) { - for (i = 0; i < 2; ++i) { - savings += update_nmv_savings(branch_ct_class0_hp[i], - cpi->common.fc.nmvc.comps[i].class0_hp, - prob.comps[i].class0_hp, - VP9_NMV_UPDATE_PROB); - savings += update_nmv_savings(branch_ct_hp[i], - cpi->common.fc.nmvc.comps[i].hp, - prob.comps[i].hp, - VP9_NMV_UPDATE_PROB); - } - } - if (savings <= 0) { - vp9_write_bit(bc, 0); - return; - } - vp9_write_bit(bc, 1); -#endif + counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, + branch_ct_joint, branch_ct_sign, branch_ct_classes, + branch_ct_class0, branch_ct_bits, + branch_ct_class0_fp, branch_ct_fp, + branch_ct_class0_hp, branch_ct_hp); + + for (j = 0; j < MV_JOINTS - 1; ++j) + update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], + VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_JOINTS - 1; ++j) { - update_nmv(bc, branch_ct_joint[j], - &cpi->common.fc.nmvc.joints[j], - prob.joints[j], - VP9_NMV_UPDATE_PROB); - } for (i = 0; i < 2; ++i) { - update_nmv(bc, branch_ct_sign[i], - 
&cpi->common.fc.nmvc.comps[i].sign, - prob.comps[i].sign, - VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_CLASSES - 1; ++j) { - update_nmv(bc, branch_ct_classes[i][j], - &cpi->common.fc.nmvc.comps[i].classes[j], - prob.comps[i].classes[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - update_nmv(bc, branch_ct_class0[i][j], - &cpi->common.fc.nmvc.comps[i].class0[j], - prob.comps[i].class0[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - update_nmv(bc, branch_ct_bits[i][j], - &cpi->common.fc.nmvc.comps[i].bits[j], - prob.comps[i].bits[j], - VP9_NMV_UPDATE_PROB); - } + update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, + prob.comps[i].sign, VP9_NMV_UPDATE_PROB); + for (j = 0; j < MV_CLASSES - 1; ++j) + update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], + prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB); + + for (j = 0; j < CLASS0_SIZE - 1; ++j) + update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], + prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], + prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB); } + for (i = 0; i < 2; ++i) { for (j = 0; j < CLASS0_SIZE; ++j) { int k; - for (k = 0; k < 3; ++k) { - update_nmv(bc, branch_ct_class0_fp[i][j][k], - &cpi->common.fc.nmvc.comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], - VP9_NMV_UPDATE_PROB); - } - } - for (j = 0; j < 3; ++j) { - update_nmv(bc, branch_ct_fp[i][j], - &cpi->common.fc.nmvc.comps[i].fp[j], - prob.comps[i].fp[j], - VP9_NMV_UPDATE_PROB); + for (k = 0; k < 3; ++k) + update_mv(bc, branch_ct_class0_fp[i][j][k], + &mvc->comps[i].class0_fp[j][k], + prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); } + + for (j = 0; j < 3; ++j) + update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], + prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB); } + if (usehp) { for (i = 0; i < 2; ++i) { - update_nmv(bc, branch_ct_class0_hp[i], - &cpi->common.fc.nmvc.comps[i].class0_hp, - prob.comps[i].class0_hp, - VP9_NMV_UPDATE_PROB); - update_nmv(bc, branch_ct_hp[i], - &cpi->common.fc.nmvc.comps[i].hp, - prob.comps[i].hp, - VP9_NMV_UPDATE_PROB); + update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, + prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, + prob.comps[i].hp, VP9_NMV_UPDATE_PROB); } } } -void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, +void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, + const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp) { const MV diff = {mv->row - ref->row, mv->col - ref->col}; const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); - usehp = usehp && vp9_use_nmv_hp(ref); + usehp = usehp && vp9_use_mv_hp(ref); write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]); if (mv_joint_vertical(j)) @@ -549,6 +446,13 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. 
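/*
 * Editor's sketch (not from the diff): how the MV difference "diff" computed
 * in vp9_encode_mv above splits into a joint type, i.e. which of the two
 * components actually needs to be coded.  The four-way split and the
 * enumerator names are restated here only for illustration and are assumed,
 * not quoted from the library headers.
 */
typedef enum {
  JOINT_ZERO   = 0,   /* both row and col are zero  */
  JOINT_HNZVZ  = 1,   /* col nonzero, row zero      */
  JOINT_HZVNZ  = 2,   /* col zero, row nonzero      */
  JOINT_HNZVNZ = 3    /* both components nonzero    */
} mv_joint_sketch;

static mv_joint_sketch get_mv_joint_sketch(int row, int col) {
  if (row == 0)
    return col == 0 ? JOINT_ZERO : JOINT_HNZVZ;
  return col == 0 ? JOINT_HZVNZ : JOINT_HNZVNZ;
}

/* Only the nonzero component(s) are then passed to encode_mv_component(),
 * which is why the code above tests mv_joint_vertical() and
 * mv_joint_horizontal() before writing each component. */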
+ if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) { + unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3; + cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude); + } } void vp9_build_nmv_cost_table(int *mvjoint, @@ -567,44 +471,42 @@ void vp9_build_nmv_cost_table(int *mvjoint, void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv) { - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; - MV mv; - int bwl = b_width_log2(mbmi->sb_type), bw = 1 << bwl; - int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl; + MODE_INFO *mi = x->e_mbd.mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; + MV diff; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; int idx, idy; if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - int i; PARTITION_INFO *pi = x->partition_info; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - i = idy * 2 + idx; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int i = idy * 2 + idx; if (pi->bmi[i].mode == NEWMV) { - mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row); - mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col); - vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row; + diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col; + vp9_inc_mv(&diff, &cpi->NMVcount); + if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) { - mv.row = pi->bmi[i].second_mv.as_mv.row - + diff.row = mi->bmi[i].as_mv[1].as_mv.row - second_best_ref_mv->as_mv.row; - mv.col = pi->bmi[i].second_mv.as_mv.col - + diff.col = mi->bmi[i].as_mv[1].as_mv.col - second_best_ref_mv->as_mv.col; - vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + vp9_inc_mv(&diff, &cpi->NMVcount); } } } } } else if (mbmi->mode == NEWMV) { - mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row); - mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col); - vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row; + diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col; + vp9_inc_mv(&diff, &cpi->NMVcount); + if (mbmi->ref_frame[1] > INTRA_FRAME) { - mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row); - mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col); - vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row; + diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col; + vp9_inc_mv(&diff, &cpi->NMVcount); } } } diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h index cb25d85..2789ce1 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libvpx/vp9/encoder/vp9_encodemv.h @@ -16,7 +16,7 @@ void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const); -void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, +void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp); void vp9_build_nmv_cost_table(int *mvjoint, @@ -28,6 +28,4 @@ void vp9_build_nmv_cost_table(int *mvjoint, void vp9_update_nmv_count(VP9_COMP *cpi, 
MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv); -void print_nmvcounts(nmv_context_counts tnmvcounts); - #endif // VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 5e26cd8..ec2e361 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -370,19 +370,6 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r } } -static enum BlockSize get_bs(BLOCK_SIZE_TYPE b) { - switch (b) { - case BLOCK_SIZE_SB8X8: - return BLOCK_8X8; - case BLOCK_SIZE_SB16X8: - return BLOCK_16X8; - case BLOCK_SIZE_SB8X16: - return BLOCK_8X16; - default: - return BLOCK_16X16; - } -} - static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, @@ -398,7 +385,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp9_variance_fn_ptr_t v_fn_ptr = - cpi->fn_ptr[get_bs(xd->mode_info_context->mbmi.sb_type)]; + cpi->fn_ptr[xd->mode_info_context->mbmi.sb_type]; int new_mv_mode_penalty = 256; int sr = 0; @@ -514,16 +501,14 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_clear_system_state(); // __asm emms; vp9_setup_src_planes(x, cpi->Source, 0, 0); - setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL); + setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL); setup_dst_planes(xd, new_yv12, 0, 0); x->partition_info = x->pi; xd->mode_info_context = cm->mi; - vp9_build_block_offsets(x); - - vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); vp9_frame_init_quantizer(cpi); @@ -986,9 +971,11 @@ static int estimate_max_q(VP9_COMP *cpi, // Corrections for higher compression speed settings // (reduced compression expected) + // FIXME(jimbankoski): Once we settle on vp9 speed features we need to + // change this code. if (cpi->compressor_speed == 1) speed_correction = cpi->oxcf.cpu_used <= 5 ? - 1.04 + (cpi->oxcf.cpu_used * 0.04) : + 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) : 1.25; // Try and pick a max Q that will be high enough to encode the @@ -1051,7 +1038,7 @@ static int estimate_cq(VP9_COMP *cpi, // (reduced compression expected) if (cpi->compressor_speed == 1) { if (cpi->oxcf.cpu_used <= 5) - speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04); else speed_correction = 1.25; } @@ -1106,13 +1093,13 @@ static int estimate_cq(VP9_COMP *cpi, } -extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate); +extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; FIRSTPASS_STATS *start_pos; - double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate; + double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -1133,10 +1120,10 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // encoded in the second pass is a guess. However the sum duration is not. // Its calculated based on the actual durations of all frames from the first // pass. 
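/*
 * Editor's note (illustration only): the vp9_new_framerate() call below
 * derives the average frame rate from first-pass totals, with durations
 * accumulated in units of 1/10,000,000 second as in the surrounding code.
 * For example, 300 frames whose durations sum to 100,000,000 ticks give
 *   10000000.0 * 300 / 100000000.0 = 30.0 fps.
 * The helper name is hypothetical.
 */
static double framerate_from_totals_sketch(double frame_count,
                                           double total_duration_ticks) {
  return 10000000.0 * frame_count / total_duration_ticks;
}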
- vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->output_framerate = cpi->oxcf.framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0); cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * @@ -2216,7 +2203,7 @@ void vp9_second_pass(VP9_COMP *cpi) { // Set nominal per second bandwidth for this frame cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth - * cpi->output_frame_rate); + * cpi->output_framerate); if (cpi->target_bandwidth < 0) cpi->target_bandwidth = 0; @@ -2636,7 +2623,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->per_frame_bandwidth = cpi->twopass.kf_bits; // Convert to a per second bitrate cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } // Note the total error score of the kf group minus the key frame itself diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c index b07d92a..81445a9 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.c +++ b/libvpx/vp9/encoder/vp9_lookahead.c @@ -15,8 +15,6 @@ #include "vp9/encoder/vp9_lookahead.h" #include "vp9/common/vp9_extend.h" -#define MAX_LAG_BUFFERS 25 - struct lookahead_ctx { unsigned int max_sz; /* Absolute size of the queue */ unsigned int sz; /* Number of buffers currently in the queue */ diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h index 81baa2c..c773f8f 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.h +++ b/libvpx/vp9/encoder/vp9_lookahead.h @@ -14,6 +14,8 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#define MAX_LAG_BUFFERS 25 + struct lookahead_entry { YV12_BUFFER_CONFIG img; int64_t ts_start; diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 65fdcbe..7d6db07 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -15,6 +15,7 @@ #include <vp9/encoder/vp9_rdopt.h> #include <vp9/common/vp9_blockd.h> #include <vp9/common/vp9_reconinter.h> +#include <vp9/common/vp9_reconintra.h> #include <vp9/common/vp9_systemdependent.h> #include <vp9/encoder/vp9_segmentation.h> @@ -35,8 +36,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv ref_full; // Further step/diamond searches as necessary - int step_param = cpi->sf.first_step + + int step_param = cpi->sf.reduce_first_step_size + (cpi->speed < 8 ? (cpi->speed > 5 ? 
1 : 0) : 2); + step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); vp9_clamp_mv_min_max(x, ref_mv); @@ -145,16 +147,11 @@ static int find_best_16x16_intra(VP9_COMP *cpi, // we're intentionally not doing 4x4, we just want a rough estimate for (mode = DC_PRED; mode <= TM_PRED; mode++) { unsigned int err; - const int bwl = b_width_log2(BLOCK_SIZE_MB16X16), bw = 4 << bwl; - const int bhl = b_height_log2(BLOCK_SIZE_MB16X16), bh = 4 << bhl; xd->mode_info_context->mbmi.mode = mode; - vp9_build_intra_predictors(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - xd->mode_info_context->mbmi.mode, - bw, bh, - xd->up_available, xd->left_available, - xd->right_available); + vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode, + x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err); @@ -323,8 +320,9 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; - CHECK_MEM_ERROR(arf_not_zz, - vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); + CHECK_MEM_ERROR(cm, arf_not_zz, + vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), + 1)); // We are not interested in results beyond the alt ref itself. if (n_frames > cpi->frames_till_gf_update_due) @@ -408,8 +406,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { // being a GF - so exit if we don't look ahead beyond that if (n_frames <= cpi->frames_till_gf_update_due) return; - if (n_frames > (int)cpi->common.frames_till_alt_ref_frame) - n_frames = cpi->common.frames_till_alt_ref_frame; + if (n_frames > (int)cpi->frames_till_alt_ref_frame) + n_frames = cpi->frames_till_alt_ref_frame; if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS; diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 2e99736..0be9891 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -19,11 +19,13 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" +// #define NEW_DIAMOND_SEARCH + void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); + ((ref_mv->as_mv.col & 7) ? 1 : 0); int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 1 : 0); + ((ref_mv->as_mv.row & 7) ? 1 : 0); int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; @@ -38,16 +40,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { x->mv_row_max = row_max; } -int vp9_init_search_range(int width, int height) { +int vp9_init_search_range(VP9_COMP *cpi, int size) { int sr = 0; - int frm = MIN(width, height); - while ((frm << sr) < MAX_FULL_PEL_VAL) + // Minimum search size no matter what the passed in value. 
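/*
 * Editor's sketch (not from the diff): the search-range derivation carried
 * out by the statements that follow in vp9_init_search_range().  Starting
 * from the block size (floored at 16), sr counts how many doublings are
 * needed before the size covers the full-pel motion range.  MAX_FULL_PEL_VAL
 * comes from vp9_mcomp.h; its value is not restated here, so it is passed in
 * as a parameter, and the function name is hypothetical.
 */
static int init_search_range_sketch(int size, int max_full_pel_val) {
  int sr = 0;
  if (size < 16) size = 16;              /* minimum search size */
  while ((size << sr) < max_full_pel_val)
    ++sr;
  if (sr > 0)
    --sr;
  return sr;    /* the caller then adds the speed-feature offset and clamps */
}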
+ size = MAX(16, size); + + while ((size << sr) < MAX_FULL_PEL_VAL) sr++; if (sr) sr--; + sr += cpi->sf.reduce_first_step_size; + sr = MIN(sr, (cpi->sf.max_step_search_steps - 2)); return sr; } @@ -366,7 +372,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, } if (xd->allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -447,7 +453,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, int offset; int usehp = xd->allow_high_precision_mv; - uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; @@ -556,7 +562,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, } if (xd->allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -597,8 +603,6 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, bestmv->as_mv.row = br; bestmv->as_mv.col = bc; - vpx_free(comp_pred); - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; @@ -930,7 +934,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, } if (x->e_mbd.allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -1509,12 +1513,13 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, this_row_offset = best_mv->as_mv.row + ss[i].mv.row; this_col_offset = best_mv->as_mv.col + ss[i].mv.col; - if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) - - { + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; @@ -1537,6 +1542,34 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, best_mv->as_mv.col += ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; + this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[best_site].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + }; +#endif } else if (best_address == in_what) (*num00)++; } @@ -1678,12 +1711,39 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, i++; } } - if (best_site != last_site) { best_mv->as_mv.row += ss[best_site].mv.row; best_mv->as_mv.col += 
ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; + this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[best_site].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + }; +#endif } else if (best_address == in_what) (*num00)++; } @@ -1704,6 +1764,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ + int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, int_mv *mvp_full, int step_param, int sadpb, int further_steps, @@ -2355,16 +2416,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - /* Compound pred buffer */ - uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; /* Get compound pred by averaging two pred blocks. */ - comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); - - bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) + + bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, + second_pred, 0x7fffffff) + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { @@ -2382,9 +2439,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, best_address; /* Get compound block and use it to calculate SAD. */ - comp_avg_pred(comp_pred, second_pred, w, h, check_here, - in_what_stride); - thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad); + thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride, + second_pred, bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; @@ -2414,16 +2470,15 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, this_mv.as_mv.col = ref_mv->as_mv.col << 3; if (bestsad < INT_MAX) { - int besterr; - comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); - besterr = fn_ptr->vf(what, what_stride, comp_pred, w, - (unsigned int *)(&thissad)) + + // FIXME(rbultje, yunqing): add full-pixel averaging variance functions + // so we don't have to use the subpixel with xoff=0,yoff=0 here. 
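/*
 * Editor's sketch (illustration only): what the sdaf() calls above compute --
 * a SAD taken against the average of two predictors -- without materialising
 * the averaged block in a temporary buffer the way the removed
 * comp_avg_pred() + sdf() pair did.  Plain C, 8-bit samples, a rounded
 * average is assumed, and the names are hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_avg_sketch(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   const uint8_t *second_pred,
                                   int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int avg = (ref[c] + second_pred[c] + 1) >> 1;  /* rounded avg */
      sad += abs(src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;   /* second predictor assumed packed width-wide */
  }
  return sad;
}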
+ int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0, + what, what_stride, (unsigned int *)(&thissad), + second_pred) + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, xd->allow_high_precision_mv); - vpx_free(comp_pred); return besterr; } else { - vpx_free(comp_pred); return INT_MAX; } } diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index 28b2efd..c13ea75 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -24,15 +24,15 @@ #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); -int vp9_init_search_range(int width, int height); - int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int weight, int ishp); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); -// Runs sequence of diamond searches in smaller steps for RD struct VP9_COMP; +int vp9_init_search_range(struct VP9_COMP *cpi, int size); + +// Runs sequence of diamond searches in smaller steps for RD int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c index f2e4ce4..993aba7 100644 --- a/libvpx/vp9/encoder/vp9_modecosts.c +++ b/libvpx/vp9/encoder/vp9_modecosts.c @@ -22,8 +22,8 @@ void vp9_init_mode_costs(VP9_COMP *c) { for (i = 0; i < VP9_INTRA_MODES; i++) { for (j = 0; j < VP9_INTRA_MODES; j++) { - vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], - x->kf_y_mode_prob[i][j], KT); + vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], + KT); } } @@ -33,7 +33,8 @@ void vp9_init_mode_costs(VP9_COMP *c) { vp9_cost_tokens(c->mb.intra_uv_mode_cost[1], x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[0], - x->kf_uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree); + vp9_kf_uv_mode_prob[VP9_INTRA_MODES - 1], + vp9_intra_mode_tree); for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index 6a14df4..e5f1a5c 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -131,6 +131,32 @@ static int gf_low_motion_minq[QINDEX_RANGE]; static int gf_high_motion_minq[QINDEX_RANGE]; static int inter_minq[QINDEX_RANGE]; +static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. 
// The formulae were derived from computing a 3rd order polynomial best @@ -217,22 +243,23 @@ void vp9_initialize_enc() { static void setup_features(VP9_COMP *cpi) { MACROBLOCKD *xd = &cpi->mb.e_mbd; + struct loopfilter *lf = &xd->lf; // Set up default state for MB feature flags - xd->segmentation_enabled = 0; + xd->seg.enabled = 0; - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; - vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs)); + xd->seg.update_map = 0; + xd->seg.update_data = 0; + vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs)); - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); - xd->mode_ref_lf_delta_enabled = 0; - xd->mode_ref_lf_delta_update = 0; - vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); - vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); - vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); - vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); + lf->mode_ref_delta_enabled = 0; + lf->mode_ref_delta_update = 0; + vp9_zero(lf->ref_deltas); + vp9_zero(lf->mode_deltas); + vp9_zero(lf->last_ref_deltas); + vp9_zero(lf->last_mode_deltas); set_default_lf_deltas(cpi); } @@ -305,26 +332,26 @@ static void configure_static_seg_features(VP9_COMP *cpi) { if (cm->frame_type == KEY_FRAME) { // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; cpi->static_mb_pct = 0; // Disable segmentation vp9_disable_segmentation((VP9_PTR)cpi); // Clear down the segment features. - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); } else if (cpi->refresh_alt_ref_frame) { // If this is an alt ref frame // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; cpi->static_mb_pct = 0; // Disable segmentation and individual segment features by default vp9_disable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); // Scan frames from current to arf frame. // This function re-enables segmentation if appropriate. @@ -332,45 +359,45 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // If segmentation was enabled set those features needed for the // arf itself. 
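/*
 * Editor's sketch (not from the diff): how a segment-level quantizer feature
 * such as the SEG_LVL_ALT_Q data set further below is typically applied.  In
 * SEGMENT_DELTADATA mode the segment value is an offset from the base Q
 * index; in absolute mode it replaces it.  The 0..255 clamp reflects the VP9
 * Q-index range; the function and parameter names are hypothetical.
 */
static int apply_seg_qindex_sketch(int base_qindex, int seg_data,
                                   int use_absolute) {
  const int q = use_absolute ? seg_data : base_qindex + seg_data;
  if (q < 0) return 0;
  if (q > 255) return 255;
  return q;
}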
- if (xd->segmentation_enabled) { - xd->update_mb_segmentation_map = 1; - xd->update_mb_segmentation_data = 1; + if (xd->seg.enabled) { + xd->seg.update_map = 1; + xd->seg.update_data = 1; qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF); // Where relevant assume segment data is delta data - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + xd->seg.abs_delta = SEGMENT_DELTADATA; } - } else if (xd->segmentation_enabled) { + } else if (xd->seg.enabled) { // All other frames if segmentation has been enabled // First normal frame in a valid gf or alt ref group - if (cpi->common.frames_since_golden == 0) { + if (cpi->frames_since_golden == 0) { // Set up segment features for normal frames in an arf group if (cpi->source_alt_ref_active) { - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 1; - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + xd->seg.update_map = 0; + xd->seg.update_data = 1; + xd->seg.abs_delta = SEGMENT_DELTADATA; qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 1.125)); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF); // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { - vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP); } } else { // Disable segmentation and clear down features if alt ref @@ -380,10 +407,10 @@ static void configure_static_seg_features(VP9_COMP *cpi) { vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); } } else if (cpi->is_src_frame_alt_ref) { // Special case where we are coding over the top of a previous @@ -391,28 +418,28 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // Segment coding disabled for compred testing // Enable ref frame features for segment 0 as well - vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME); // All mbs should use ALTREF_FRAME - vp9_clear_segdata(xd, 0, SEG_LVL_REF_FRAME); - vp9_set_segdata(xd, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_clear_segdata(xd, 1, SEG_LVL_REF_FRAME); - vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(&xd->seg, 0, SEG_LVL_REF_FRAME); + vp9_set_segdata(&xd->seg, 0, 
SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); // Skip all MBs if high Q (0,0 mv and skip coeffs) if (high_q) { - vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP); - vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); + vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP); } - // Enable data udpate - xd->update_mb_segmentation_data = 1; + // Enable data update + xd->seg.update_data = 1; } else { // All other frames. // No updates.. leave things as they are. - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; } } } @@ -518,20 +545,22 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) { } static void set_default_lf_deltas(VP9_COMP *cpi) { - cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; - cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; + struct loopfilter *lf = &cpi->mb.e_mbd.lf; + + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; - vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas)); - vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + vp9_zero(lf->ref_deltas); + vp9_zero(lf->mode_deltas); // Test of ref frame deltas - cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2; - cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0; - cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2; - cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2; + lf->ref_deltas[INTRA_FRAME] = 2; + lf->ref_deltas[LAST_FRAME] = 0; + lf->ref_deltas[GOLDEN_FRAME] = -2; + lf->ref_deltas[ALTREF_FRAME] = -2; - cpi->mb.e_mbd.mode_lf_deltas[0] = 0; // Zero - cpi->mb.e_mbd.mode_lf_deltas[1] = 0; // New mv + lf->mode_deltas[0] = 0; // Zero + lf->mode_deltas[1] = 0; // New mv } static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { @@ -543,70 +572,70 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { for (i = 0; i < MAX_MODES; ++i) sf->thresh_mult[i] = mode == 0 ? 
-500 : 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - - sf->thresh_mult[THR_NEARMV ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARG ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARA ] += speed_multiplier * 1000; - - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] += speed_multiplier * 1000; - sf->thresh_mult[THR_V_PRED ] += speed_multiplier * 1000; - sf->thresh_mult[THR_H_PRED ] += speed_multiplier * 1000; - sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500; - sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500; - sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500; - sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500; - sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500; - sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500; - - sf->thresh_mult[THR_B_PRED ] += speed_multiplier * 2500; - - sf->thresh_mult[THR_NEWMV ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEWG ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEWA ] += speed_multiplier * 1000; - - sf->thresh_mult[THR_SPLITMV ] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITG ] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITA ] += speed_multiplier * 2500; - - sf->thresh_mult[THR_COMP_ZEROLA ] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_ZEROGA ] += speed_multiplier * 1500; - - sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500; - - sf->thresh_mult[THR_COMP_NEARLA ] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEARGA ] += speed_multiplier * 1500; - - sf->thresh_mult[THR_COMP_NEWLA ] += speed_multiplier * 2000; - sf->thresh_mult[THR_COMP_NEWGA ] += speed_multiplier * 2000; - - sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500; - sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500; - - if (speed > 4) { + sf->thresh_mult[THR_NEARESTG] = 0; + sf->thresh_mult[THR_NEARESTA] = 0; + + sf->thresh_mult[THR_NEWMV] += speed_multiplier * 1000; + sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000; + sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1000; + + sf->thresh_mult[THR_DC] += speed_multiplier * 1000; + + sf->thresh_mult[THR_NEWG] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEWA] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000; + + sf->thresh_mult[THR_TM] += speed_multiplier * 1000; + + sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEWLA] += speed_multiplier * 2000; + sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000; + sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEWGA] += speed_multiplier * 2000; + + sf->thresh_mult[THR_SPLITMV] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITG] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITA] += speed_multiplier * 2500; + sf->thresh_mult[THR_COMP_SPLITLA] += speed_multiplier * 4500; + sf->thresh_mult[THR_COMP_SPLITGA] += speed_multiplier * 4500; + + sf->thresh_mult[THR_ZEROMV] += speed_multiplier * 2000; + sf->thresh_mult[THR_ZEROG] += speed_multiplier * 2000; + sf->thresh_mult[THR_ZEROA] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 2500; + 
sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 2500; + + sf->thresh_mult[THR_B_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_H_PRED] += speed_multiplier * 2000; + sf->thresh_mult[THR_V_PRED] += speed_multiplier * 2000; + sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 2500; + sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D27_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500; + + if (cpi->sf.skip_lots_of_modes) { for (i = 0; i < MAX_MODES; ++i) sf->thresh_mult[i] = INT_MAX; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 0; - sf->thresh_mult[THR_NEWMV ] = 4000; - sf->thresh_mult[THR_NEWG ] = 4000; - sf->thresh_mult[THR_NEWA ] = 4000; + sf->thresh_mult[THR_DC] = 2000; + sf->thresh_mult[THR_TM] = 2000; + sf->thresh_mult[THR_NEWMV] = 4000; + sf->thresh_mult[THR_NEWG] = 4000; + sf->thresh_mult[THR_NEWA] = 4000; sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEARESTG] = 0; + sf->thresh_mult[THR_NEARESTA] = 0; + sf->thresh_mult[THR_NEARMV] = 2000; + sf->thresh_mult[THR_NEARG] = 2000; + sf->thresh_mult[THR_NEARA] = 2000; sf->thresh_mult[THR_COMP_NEARESTLA] = 2000; + sf->thresh_mult[THR_SPLITMV] = 2500; + sf->thresh_mult[THR_SPLITG] = 2500; + sf->thresh_mult[THR_SPLITA] = 2500; sf->recode_loop = 0; } @@ -649,6 +678,15 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; } + + if (sf->disable_splitmv == 1) { + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + + sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; + sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; + } } void vp9_set_speed_features(VP9_COMP *cpi) { @@ -677,10 +715,38 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->half_pixel_search = 1; sf->iterative_sub_pixel = 1; sf->optimize_coefficients = !cpi->oxcf.lossless; - sf->first_step = 0; + sf->reduce_first_step_size = 0; + sf->auto_mv_step_size = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4; - sf->adpative_rd_thresh = 0; + sf->adaptive_rd_thresh = 0; + sf->use_lastframe_partitioning = 0; + sf->tx_size_search_method = USE_FULL_RD; + sf->use_8tap_always = 0; + sf->use_avoid_tested_higherror = 0; + sf->reference_masking = 0; + sf->skip_lots_of_modes = 0; + sf->adjust_thresholds_by_speed = 0; + sf->partition_by_variance = 0; + sf->use_one_partition_size_always = 0; + sf->less_rectangular_check = 0; + sf->use_square_partition_only = 0; + sf->use_partitions_less_than = 0; + sf->less_than_block_size = BLOCK_SIZE_MB16X16; + sf->use_partitions_greater_than = 0; + sf->greater_than_block_size = BLOCK_SIZE_SB8X8; + sf->adjust_partitioning_from_last_frame = 0; + sf->last_partitioning_redo_frequency = 4; + sf->disable_splitmv = 0; + sf->mode_search_skip_flags = 0; + sf->last_chroma_intra_mode = TM_PRED; + sf->use_rd_breakout = 0; + sf->skip_encode_sb = 0; + sf->use_uv_intra_rd_estimate = 0; + sf->using_small_partition_info = 0; + // Skip any mode not chosen at size < X for all sizes > X + // Hence BLOCK_SIZE_SB64X64 (skip is off) + 
sf->unused_mode_skip_lvl = BLOCK_SIZE_SB64X64; #if CONFIG_MULTIPLE_ARF // Switch segmentation off. @@ -701,19 +767,121 @@ void vp9_set_speed_features(VP9_COMP *cpi) { #else sf->static_segmentation = 0; #endif - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; - sf->adpative_rd_thresh = 1; - if (speed > 0) { + sf->use_avoid_tested_higherror = 1; + sf->adaptive_rd_thresh = 1; + sf->last_chroma_intra_mode = TM_PRED; + + if (speed == 1) { + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->less_rectangular_check = 1; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); + sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0); + sf->disable_splitmv = + (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; + sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA; + sf->last_chroma_intra_mode = H_PRED; + sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; + sf->auto_mv_step_size = 1; + } + if (speed == 2) { + sf->adjust_thresholds_by_speed = 1; + sf->less_rectangular_check = 1; + sf->use_square_partition_only = 1; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->use_lastframe_partitioning = 1; + sf->adjust_partitioning_from_last_frame = 1; + sf->last_partitioning_redo_frequency = 3; + sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_COMP_REFMISMATCH; + sf->last_chroma_intra_mode = DC_PRED; + sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; + sf->use_uv_intra_rd_estimate = 1; + sf->using_small_partition_info = 1; + sf->disable_splitmv = + (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; + sf->auto_mv_step_size = 1; + } + if (speed == 3) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->partition_by_variance = 1; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_COMP_REFMISMATCH; + sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; + sf->disable_splitmv = 1; + sf->auto_mv_step_size = 1; + } + if (speed == 4) { + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->use_one_partition_size_always = 1; + sf->always_this_block_size = BLOCK_SIZE_MB16X16; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? 
+ USE_FULL_RD : + USE_LARGESTALL); + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_COMP_REFMISMATCH; + sf->use_rd_breakout = 1; sf->optimize_coefficients = 0; - sf->first_step = 1; + sf->auto_mv_step_size = 1; + // sf->reduce_first_step_size = 1; + // sf->reference_masking = 1; + + sf->disable_splitmv = 1; + } + /* + if (speed == 2) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_partitions_less_than = 1; + sf->less_than_block_size = BLOCK_SIZE_MB16X16; + } + if (speed == 3) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_partitions_greater_than = 1; + sf->greater_than_block_size = BLOCK_SIZE_SB8X8; } + */ + break; }; /* switch */ // Set rd thresholds based on mode and speed setting - set_rd_speed_thresholds(cpi, mode, speed); + if (cpi->sf.adjust_thresholds_by_speed) + set_rd_speed_thresholds(cpi, mode, speed); + else + set_rd_speed_thresholds(cpi, mode, 0); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. @@ -732,8 +900,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; - vp9_init_quantizer(cpi); - if (cpi->sf.iterative_sub_pixel == 1) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively; } else if (cpi->sf.quarter_pixel_search) { @@ -770,8 +936,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { static int alloc_partition_data(VP9_COMP *cpi) { vpx_free(cpi->mb.pip); - cpi->mb.pip = vpx_calloc((cpi->common.mode_info_stride) * - (cpi->common.mi_rows + 64 / MI_SIZE), + cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride * + (cpi->common.mi_rows + MI_BLOCK_SIZE), sizeof(PARTITION_INFO)); if (!cpi->mb.pip) return 1; @@ -811,7 +977,7 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); } // Data used for real time vc mode to see if gf needs refreshing @@ -820,12 +986,12 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { cpi->gf_update_recommended = 0; vpx_free(cpi->mb_activity_map); - CHECK_MEM_ERROR(cpi->mb_activity_map, + CHECK_MEM_ERROR(cm, cpi->mb_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); vpx_free(cpi->mb_norm_activity_map); - CHECK_MEM_ERROR(cpi->mb_norm_activity_map, + CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); } @@ -889,14 +1055,14 @@ int vp9_reverse_trans(int x) { return 63; }; -void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) { +void vp9_new_framerate(VP9_COMP *cpi, double framerate) { if (framerate < 0.1) framerate = 30; - cpi->oxcf.frame_rate = framerate; - cpi->output_frame_rate = cpi->oxcf.frame_rate; - cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); - cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); + cpi->oxcf.framerate = framerate; + cpi->output_framerate = cpi->oxcf.framerate; + cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); + cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -931,19 +1097,13 @@ static 
int64_t rescale(int val, int64_t num, int denom) { static void set_tile_limits(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int min_log2_tiles, max_log2_tiles; - cm->log2_tile_columns = cpi->oxcf.tile_columns; - cm->log2_tile_rows = cpi->oxcf.tile_rows; + int min_log2_tile_cols, max_log2_tile_cols; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles); - max_log2_tiles += min_log2_tiles; - - cm->log2_tile_columns = clamp(cm->log2_tile_columns, - min_log2_tiles, max_log2_tiles); - - cm->tile_columns = 1 << cm->log2_tile_columns; - cm->tile_rows = 1 << cm->log2_tile_rows; + cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, + min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; } static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { @@ -1059,7 +1219,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { { int i; - for (i = 0; i < MAX_MB_SEGMENTS; i++) + for (i = 0; i < MAX_SEGMENTS; i++) cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1093,7 +1253,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.target_bandwidth, 1000); // Set up frame rate and related parameters rate control values. - vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate); + vp9_new_framerate(cpi, cpi->oxcf.framerate); // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; @@ -1122,7 +1282,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness); - cm->sharpness_level = cpi->oxcf.Sharpness; + cpi->mb.e_mbd.lf.sharpness_level = cpi->oxcf.Sharpness; if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some @@ -1233,15 +1393,16 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { return 0; } - cpi->common.error.setjmp = 1; + cm->error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site), + (MAX_MVSEARCH_STEPS * 8) + 1)); - vp9_create_common(&cpi->common); + vp9_create_common(cm); init_config((VP9_PTR)cpi, oxcf); - cpi->common.current_video_frame = 0; + cm->current_video_frame = 0; cpi->kf_overspend_bits = 0; cpi->kf_bitrate_adjustment = 0; cpi->frames_till_gf_update_due = 0; @@ -1249,7 +1410,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->non_gf_bitrate_adjustment = 0; // Set reference frame sign bias for ALTREF frame to 1 (for now) - cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1; + cm->ref_frame_sign_bias[ALTREF_FRAME] = 1; cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -1258,28 +1419,27 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->gold_is_alt = 0; // Create the encoder segmentation map and set all entries to 0 - CHECK_MEM_ERROR(cpi->segmentation_map, - vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1)); + CHECK_MEM_ERROR(cm, cpi->segmentation_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a copy in common for temporal coding - CHECK_MEM_ERROR(cm->last_frame_seg_map, - vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1)); + CHECK_MEM_ERROR(cm, cm->last_frame_seg_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it - CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy, - 
vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1)); + CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); - vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols)); + CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1)); + vpx_memset(cpi->active_map, 1, cm->MBs); cpi->active_map_enabled = 0; for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { - CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats, - vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols * - sizeof(*cpi->mbgraph_stats[i].mb_stats), - 1)); + CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats, + vpx_calloc(cm->MBs * + sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } #ifdef ENTROPY_STATS @@ -1385,7 +1545,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); for (i = 0; i < KEY_FRAME_CONTEXT; i++) - cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate; + cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate; #ifdef OUTPUT_YUV_SRC yuv_file = fopen("bd.yuv", "ab"); @@ -1420,8 +1580,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { for (i = 0; i < MAX_MODES; i++) cpi->rd_thresh_mult[i] = 128; -#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ + SDX3F, SDX8F, SDX4DF)\ cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ @@ -1432,67 +1594,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->fn_ptr[BT].sdx8f = SDX8F; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; - BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16, + BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg, + vp9_variance32x16, vp9_sub_pixel_variance32x16, vp9_sub_pixel_avg_variance32x16, NULL, NULL, NULL, NULL, NULL, vp9_sad32x16x4d) - BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32, + BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg, + vp9_variance16x32, vp9_sub_pixel_variance16x32, vp9_sub_pixel_avg_variance16x32, NULL, NULL, NULL, NULL, NULL, vp9_sad16x32x4d) - BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32, + BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg, + vp9_variance64x32, vp9_sub_pixel_variance64x32, vp9_sub_pixel_avg_variance64x32, NULL, NULL, NULL, NULL, NULL, vp9_sad64x32x4d) - BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64, + BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg, + vp9_variance32x64, vp9_sub_pixel_variance32x64, vp9_sub_pixel_avg_variance32x64, NULL, NULL, NULL, NULL, NULL, vp9_sad32x64x4d) - BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32, + BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg, + vp9_variance32x32, vp9_sub_pixel_variance32x32, vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v, vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8, vp9_sad32x32x4d) - BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64, + BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg, + vp9_variance64x64, vp9_sub_pixel_variance64x64, vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v, vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, 
vp9_sad64x64x8, vp9_sad64x64x4d) - BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16, + BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, + vp9_variance16x16, vp9_sub_pixel_variance16x16, vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v, vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, vp9_sad16x16x4d) - BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8, + BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg, + vp9_variance16x8, vp9_sub_pixel_variance16x8, vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) - BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16, + BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg, + vp9_variance8x16, vp9_sub_pixel_variance8x16, vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) - BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8, + BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg, + vp9_variance8x8, vp9_sub_pixel_variance8x8, vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) - BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4, + BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg, + vp9_variance8x4, vp9_sub_pixel_variance8x4, vp9_sub_pixel_avg_variance8x4, NULL, NULL, NULL, NULL, vp9_sad8x4x8, vp9_sad8x4x4d) - BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8, + BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg, + vp9_variance4x8, vp9_sub_pixel_variance4x8, vp9_sub_pixel_avg_variance4x8, NULL, NULL, NULL, NULL, vp9_sad4x8x8, vp9_sad4x8x4d) - BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, + BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg, + vp9_variance4x4, vp9_sub_pixel_variance4x4, vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) @@ -1510,7 +1685,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { */ vp9_init_quantizer(cpi); - vp9_loop_filter_init(cm); + vp9_loop_filter_init(cm, &cpi->mb.e_mbd.lf); cpi->common.error.setjmp = 0; @@ -1756,8 +1931,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) { struct vpx_codec_cx_pkt pkt; uint64_t sse; int i; - unsigned int width = cpi->common.width; - unsigned int height = cpi->common.height; + unsigned int width = orig->y_crop_width; + unsigned int height = orig->y_crop_height; pkt.kind = VPX_CODEC_PSNR_PKT; sse = calc_plane_error(orig->y_buffer, orig->y_stride, @@ -1768,8 +1943,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) { pkt.data.psnr.samples[0] = width * height; pkt.data.psnr.samples[1] = width * height; - width = orig->uv_width; - height = orig->uv_height; + width = orig->uv_crop_width; + height = orig->uv_crop_height; sse = calc_plane_error(orig->u_buffer, orig->uv_stride, recon->u_buffer, recon->uv_stride, @@ -1997,7 +2172,7 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, static void update_alt_ref_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; #if CONFIG_MULTIPLE_ARF if (!cpi->multi_arf_enabled) @@ -2013,7 +2188,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (cpi->refresh_golden_frame) { // this frame refreshes means next frames don't unless specified by user cpi->refresh_golden_frame = 0; - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; // ******** Fixed Q test code only 
************ // If we are going to use the ALT reference for the next group of frames set a flag to say so. @@ -2035,10 +2210,10 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; - if (cpi->common.frames_till_alt_ref_frame) - cpi->common.frames_till_alt_ref_frame--; + if (cpi->frames_till_alt_ref_frame) + cpi->frames_till_alt_ref_frame--; - cpi->common.frames_since_golden++; + cpi->frames_since_golden++; } } @@ -2230,8 +2405,9 @@ static void update_reference_frames(VP9_COMP * const cpi) { } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { - if (cpi->mb.e_mbd.lossless) { - cm->filter_level = 0; + MACROBLOCKD *xd = &cpi->mb.e_mbd; + if (xd->lossless) { + xd->lf.filter_level = 0; } else { struct vpx_usec_timer timer; @@ -2245,53 +2421,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); } - if (cm->filter_level > 0) { - vp9_set_alt_lf_level(cpi, cm->filter_level); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0); + if (xd->lf.filter_level > 0) { + vp9_set_alt_lf_level(cpi, xd->lf.filter_level); + vp9_loop_filter_frame(cm, xd, xd->lf.filter_level, 0); } - vp9_extend_frame_borders(cm->frame_to_show, - cm->subsampling_x, cm->subsampling_y); - -} - -void vp9_select_interp_filter_type(VP9_COMP *cpi) { - int i; - int high_filter_index = 0; - unsigned int thresh; - unsigned int high_count = 0; - unsigned int count_sum = 0; - unsigned int *hist = cpi->best_switchable_interp_count; - - if (DEFAULT_INTERP_FILTER != SWITCHABLE) { - cpi->common.mcomp_filter_type = DEFAULT_INTERP_FILTER; - return; - } - - // TODO(agrange): Look at using RD criteria to select the interpolation - // filter to use for the next frame rather than this simpler counting scheme. - - // Select the interpolation filter mode for the next frame - // based on the selection frequency seen in the current frame. - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { - unsigned int count = hist[i]; - count_sum += count; - if (count > high_count) { - high_count = count; - high_filter_index = i; - } - } - - thresh = (unsigned int)(0.80 * count_sum); - - if (high_count > thresh) { - // One filter accounts for 80+% of cases so force the next - // frame to use this filter exclusively using frame-level flag. - cpi->common.mcomp_filter_type = vp9_switchable_interp[high_filter_index]; - } else { - // Use a MB-level switchable filter selection strategy. 
- cpi->common.mcomp_filter_type = SWITCHABLE; - } + vp9_extend_frame_inner_borders(cm->frame_to_show, + cm->subsampling_x, cm->subsampling_y); } static void scale_references(VP9_COMP *cpi) { @@ -2326,6 +2462,31 @@ static void release_scaled_references(VP9_COMP *cpi) { cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--; } +static void full_to_model_count(unsigned int *model_count, + unsigned int *full_count) { + int n; + model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; + model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; + model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; + for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n) + model_count[TWO_TOKEN] += full_count[n]; + model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN]; +} + +static void full_to_model_counts( + vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) { + int i, j, k, l; + for (i = 0; i < BLOCK_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + if (l >= 3 && k == 0) + continue; + full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]); + } +} + + static void encode_frame_to_data_rate(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, @@ -2351,6 +2512,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int undershoot_seen = 0; SPEED_FEATURES *sf = &cpi->sf; + unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); #if RESET_FOREACH_FILTER int q_low0; int q_high0; @@ -2392,7 +2554,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->per_frame_bandwidth = cpi->twopass.gf_bits; // per second target bitrate cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } // Clear zbin over-quant value and mode boost values. @@ -2421,8 +2583,26 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set default state for segment based loop filter update flags - xd->mode_ref_lf_delta_update = 0; - + xd->lf.mode_ref_delta_update = 0; + + // Initialize cpi->mv_step_param to default based on max resolution + cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); + // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. 
+ if (sf->auto_mv_step_size) { + if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) { + // initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) + // allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution + cpi->mv_step_param = vp9_init_search_range( + cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->max_mv_magnitude = 0; + } + } // Set various flags etc to special state if it is a key frame if (cm->frame_type == KEY_FRAME) { @@ -2432,9 +2612,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, setup_features(cpi); // If segmentation is enabled force a map update for key frames - if (xd->segmentation_enabled) { - xd->update_mb_segmentation_map = 1; - xd->update_mb_segmentation_data = 1; + if (xd->seg.enabled) { + xd->seg.update_map = 1; + xd->seg.update_data = 1; } // The alternate reference frame cannot be active for a key frame @@ -2965,35 +3145,36 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->dummy_packing = 0; vp9_pack_bitstream(cpi, dest, size); - if (xd->update_mb_segmentation_map) { + if (xd->seg.update_map) update_reference_segmentation_map(cpi); - } release_scaled_references(cpi); update_reference_frames(cpi); for (t = TX_4X4; t <= TX_32X32; t++) - vp9_full_to_model_counts(cpi->common.fc.coef_counts[t], - cpi->coef_counts[t]); + full_to_model_counts(cpi->common.counts.coef[t], + cpi->coef_counts[t]); if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_coef_probs(&cpi->common); } if (cpi->common.frame_type != KEY_FRAME) { - vp9_copy(cpi->common.fc.y_mode_counts, cpi->y_mode_count); - vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count); - vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count); - vp9_copy(cm->fc.intra_inter_count, cpi->intra_inter_count); - vp9_copy(cm->fc.comp_inter_count, cpi->comp_inter_count); - vp9_copy(cm->fc.single_ref_count, cpi->single_ref_count); - vp9_copy(cm->fc.comp_ref_count, cpi->comp_ref_count); - cpi->common.fc.NMVcount = cpi->NMVcount; + FRAME_COUNTS *counts = &cpi->common.counts; + + vp9_copy(counts->y_mode, cpi->y_mode_count); + vp9_copy(counts->uv_mode, cpi->y_uv_mode_count); + vp9_copy(counts->partition, cpi->partition_count); + vp9_copy(counts->intra_inter, cpi->intra_inter_count); + vp9_copy(counts->comp_inter, cpi->comp_inter_count); + vp9_copy(counts->single_ref, cpi->single_ref_count); + vp9_copy(counts->comp_ref, cpi->comp_ref_count); + counts->mv = cpi->NMVcount; if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_mode_probs(&cpi->common); vp9_adapt_mode_context(&cpi->common); - vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); } } @@ -3273,23 +3454,32 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas. 
- xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; - xd->mode_ref_lf_delta_update = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; + xd->lf.mode_ref_delta_update = 0; // keep track of the last coded dimensions cm->last_width = cm->width; cm->last_height = cm->height; - // Don't increment frame counters if this was an altref buffer - // update not a real frame + // reset to normal state now that we are done. cm->last_show_frame = cm->show_frame; if (cm->show_frame) { + // current mip will be the prev_mip for the next frame + MODE_INFO *temp = cm->prev_mip; + cm->prev_mip = cm->mip; + cm->mip = temp; + + // update the upper left visible macroblock ptrs + cm->mi = cm->mip + cm->mode_info_stride + 1; + + // Don't increment frame counters if this was an altref buffer + // update not a real frame ++cm->current_video_frame; ++cpi->frames_since_key; } - - // reset to normal state now that we are done. + // restore prev_mi + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; #if 0 { @@ -3307,17 +3497,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_rec_frame(cm); #endif - if (cm->show_frame) { - vpx_memcpy(cm->prev_mip, cm->mip, - cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) * - sizeof(MODE_INFO)); - } else { - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) * - sizeof(MODE_INFO)); - } - // restore prev_mi - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; } static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, @@ -3327,7 +3506,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, vp9_second_pass(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); - + // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt"); #ifdef DISABLE_RC_LONG_TERM_MEM cpi->twopass.bits_left -= cpi->this_frame_target; #else @@ -3335,14 +3514,14 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, #endif if (!cpi->refresh_alt_ref_frame) { - double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate; + double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); if (two_pass_min_rate < lower_bounds_min_rate) two_pass_min_rate = lower_bounds_min_rate; - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate); } } @@ -3368,7 +3547,6 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) res = -1; - cm->clr_type = sd->clrtype; vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); @@ -3385,9 +3563,9 @@ static int frame_is_reference(const VP9_COMP *cpi) { cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || cm->refresh_frame_context || - mb->mode_ref_lf_delta_update || - mb->update_mb_segmentation_map || - mb->update_mb_segmentation_data; + mb->lf.mode_ref_delta_update || + mb->seg.update_map || + mb->seg.update_data; } #if CONFIG_MULTIPLE_ARF @@ -3458,7 +3636,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->is_src_frame_alt_ref = 0; // TODO(agrange) This needs to vary depending on where the next ARF is. 
- cm->frames_till_alt_ref_frame = frames_to_arf; + cpi->frames_till_alt_ref_frame = frames_to_arf; #if CONFIG_MULTIPLE_ARF if (!cpi->multi_arf_enabled) @@ -3565,18 +3743,18 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, if (this_duration) { if (step) { - vp9_new_frame_rate(cpi, 10000000.0 / this_duration); + vp9_new_framerate(cpi, 10000000.0 / this_duration); } else { // Average this frame's rate into the last second's average // frame rate. If we haven't seen 1 second yet, then average // over the whole interval seen. const double interval = MIN((double)(cpi->source->ts_end - cpi->first_time_stamp_ever), 10000000.0); - double avg_duration = 10000000.0 / cpi->oxcf.frame_rate; + double avg_duration = 10000000.0 / cpi->oxcf.framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; - vp9_new_frame_rate(cpi, 10000000.0 / avg_duration); + vp9_new_framerate(cpi, 10000000.0 / avg_duration); } } @@ -3691,16 +3869,16 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, double sq_error; ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride, - recon->y_buffer, recon->y_stride, orig->y_width, - orig->y_height); + recon->y_buffer, recon->y_stride, + orig->y_crop_width, orig->y_crop_height); ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride, - recon->u_buffer, recon->uv_stride, orig->uv_width, - orig->uv_height); + recon->u_buffer, recon->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride, - recon->v_buffer, recon->uv_stride, orig->uv_width, - orig->uv_height); + recon->v_buffer, recon->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); sq_error = ye + ue + ve; @@ -3716,21 +3894,21 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, double weight = 0; #if CONFIG_POSTPROC vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, - cm->filter_level * 10 / 6); + cpi->mb.e_mbd.lf.filter_level * 10 / 6); #endif vp9_clear_system_state(); ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride, - pp->y_buffer, pp->y_stride, orig->y_width, - orig->y_height); + pp->y_buffer, pp->y_stride, + orig->y_crop_width, orig->y_crop_height); ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride, - pp->u_buffer, pp->uv_stride, orig->uv_width, - orig->uv_height); + pp->u_buffer, pp->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride, - pp->v_buffer, pp->uv_stride, orig->uv_width, - orig->uv_height); + pp->v_buffer, pp->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); sq_error = ye + ue + ve; @@ -3791,7 +3969,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, else { int ret; #if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, dest, flags); + ret = vp9_post_proc_frame(&cpi->common, &cpi->mb.e_mbd.lf, dest, flags); #else if (cpi->common.frame_to_show) { @@ -3811,11 +3989,11 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, } int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[MAX_MB_SEGMENTS], - int delta_lf[MAX_MB_SEGMENTS], - unsigned int threshold[MAX_MB_SEGMENTS]) { + unsigned int cols, int delta_q[MAX_SEGMENTS], + int delta_lf[MAX_SEGMENTS], + unsigned int threshold[MAX_SEGMENTS]) { VP9_COMP *cpi = (VP9_COMP *) comp; - signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS]; + signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; MACROBLOCKD 
*xd = &cpi->mb.e_mbd; int i; @@ -3834,23 +4012,23 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, vp9_enable_segmentation((VP9_PTR)cpi); // Set up the quan, LF and breakout threshold segment data - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; cpi->segment_encode_breakout[i] = threshold[i]; } // Enable the loop and quant changes in the feature mask - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { if (delta_q[i]) - vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q); + vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); else - vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); if (delta_lf[i]) - vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF); + vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF); else - vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF); + vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF); } // Initialise the feature data structure diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index f5f1c07..0798927 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -89,9 +89,7 @@ typedef struct { int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; - vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; - vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + struct tx_probs tx_probs; vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; } CODING_CONTEXT; @@ -143,55 +141,52 @@ typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { - THR_ZEROMV, - THR_DC, - THR_NEARESTMV, - THR_NEARMV, - - THR_ZEROG, + THR_NEARESTA, THR_NEARESTG, + THR_NEWMV, + THR_COMP_NEARESTLA, + THR_NEARMV, + THR_COMP_NEARESTGA, - THR_ZEROA, - THR_NEARESTA, + THR_DC, - THR_NEARG, + THR_NEWG, + THR_NEWA, THR_NEARA, - THR_V_PRED, - THR_H_PRED, - THR_D45_PRED, - THR_D135_PRED, - THR_D117_PRED, - THR_D153_PRED, - THR_D27_PRED, - THR_D63_PRED, THR_TM, - THR_NEWMV, - THR_NEWG, - THR_NEWA, + THR_COMP_NEARLA, + THR_COMP_NEWLA, + THR_NEARG, + THR_COMP_NEARGA, + THR_COMP_NEWGA, THR_SPLITMV, THR_SPLITG, THR_SPLITA, + THR_COMP_SPLITLA, + THR_COMP_SPLITGA, - THR_B_PRED, - + THR_ZEROMV, + THR_ZEROG, + THR_ZEROA, THR_COMP_ZEROLA, - THR_COMP_NEARESTLA, - THR_COMP_NEARLA, - THR_COMP_ZEROGA, - THR_COMP_NEARESTGA, - THR_COMP_NEARGA, - THR_COMP_NEWLA, - THR_COMP_NEWGA, - - THR_COMP_SPLITLA, - THR_COMP_SPLITGA, + THR_B_PRED, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D27_PRED, + THR_D153_PRED, + THR_D63_PRED, + THR_D117_PRED, + THR_D45_PRED, } THR_MODES; typedef enum { @@ -200,6 +195,37 @@ typedef enum { HEX = 2 } SEARCH_METHODS; +typedef enum { + USE_FULL_RD = 0, + USE_LARGESTINTRA, + USE_LARGESTINTRA_MODELINTER, + USE_LARGESTALL +} TX_SIZE_SEARCH_METHOD; + +typedef enum { + // Values should be powers of 2 so that they can be selected as bits of + // an integer flags field + + // terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. 
+ FLAG_EARLY_TERMINATE = 1, + + // skips comp inter modes if the best so far is an intra mode + FLAG_SKIP_COMP_BESTINTRA = 2, + + // skips comp inter modes if the best single intermode so far does + // not have the same reference as one of the two references being + // tested + FLAG_SKIP_COMP_REFMISMATCH = 4, + + // skips oblique intra modes if the best so far is an inter mode + FLAG_SKIP_INTRA_BESTINTER = 8, + + // skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions + FLAG_SKIP_INTRA_DIRMISMATCH = 16, +} MODE_SEARCH_SKIP_LOGIC; + typedef struct { int RD; SEARCH_METHODS search_method; @@ -210,53 +236,63 @@ typedef struct { int quarter_pixel_search; int thresh_mult[MAX_MODES]; int max_step_search_steps; - int first_step; + int reduce_first_step_size; + int auto_mv_step_size; int optimize_coefficients; int search_best_filter; int static_segmentation; int comp_inter_joint_search_thresh; - int adpative_rd_thresh; + int adaptive_rd_thresh; + int skip_encode_sb; + int skip_encode_frame; + int use_lastframe_partitioning; + TX_SIZE_SEARCH_METHOD tx_size_search_method; + int use_8tap_always; + int use_avoid_tested_higherror; + int skip_lots_of_modes; + int adjust_thresholds_by_speed; + int partition_by_variance; + int use_one_partition_size_always; + int less_rectangular_check; + int use_square_partition_only; + int unused_mode_skip_lvl; + int reference_masking; + BLOCK_SIZE_TYPE always_this_block_size; + int use_partitions_greater_than; + BLOCK_SIZE_TYPE greater_than_block_size; + int use_partitions_less_than; + BLOCK_SIZE_TYPE less_than_block_size; + int adjust_partitioning_from_last_frame; + int last_partitioning_redo_frequency; + int disable_splitmv; + int using_small_partition_info; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + MB_PREDICTION_MODE last_chroma_intra_mode; + int use_rd_breakout; + int use_uv_intra_rd_estimate; } SPEED_FEATURES; -enum BlockSize { - BLOCK_4X4, - BLOCK_4X8, - BLOCK_8X4, - BLOCK_8X8, - BLOCK_8X16, - BLOCK_16X8, - BLOCK_16X16, - BLOCK_32X32, - BLOCK_32X16, - BLOCK_16X32, - BLOCK_64X32, - BLOCK_32X64, - BLOCK_64X64, - BLOCK_MAX_SB_SEGMENTS, -}; - typedef struct VP9_COMP { + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]); - - DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); #if CONFIG_ALPHA - DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, 
a_round[QINDEX_RANGE][16]); - - DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]); #endif - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); MACROBLOCK mb; VP9_COMMON common; @@ -274,6 +310,7 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG *un_scaled_source; YV12_BUFFER_CONFIG scaled_source; + unsigned int frames_till_alt_ref_frame; int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref int source_alt_ref_active; // an alt ref frame has been encoded and is usable @@ -316,6 +353,9 @@ typedef struct VP9_COMP { unsigned int mode_check_freq[MAX_MODES]; unsigned int mode_test_hit_counts[MAX_MODES]; unsigned int mode_chosen_counts[MAX_MODES]; + int64_t unused_mode_skip_mask; + int ref_frame_mask; + int set_ref_frame_mask; int rd_thresh_mult[MAX_MODES]; int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES]; @@ -323,17 +363,21 @@ typedef struct VP9_COMP { int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES]; int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; + // FIXME(rbultje) int64_t? int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES]; unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; unsigned int single_ref_count[REF_CONTEXTS][2][2]; unsigned int comp_ref_count[REF_CONTEXTS][2]; - // FIXME contextualize - int64_t rd_tx_select_diff[NB_TXFM_MODES]; + // FIXME(rbultje) can this overflow? int rd_tx_select_threshes[4][NB_TXFM_MODES]; + int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1]; + int RDMULT; int RDDIV; @@ -349,6 +393,7 @@ typedef struct VP9_COMP { double key_frame_rate_correction_factor; double gf_rate_correction_factor; + unsigned int frames_since_golden; int frames_till_gf_update_due; // Count down till next GF int gf_overspend_bits; // Total bits overspent becasue of GF boost (cumulative) @@ -368,7 +413,7 @@ typedef struct VP9_COMP { int av_per_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation that should be used for any frame int inter_frame_target; - double output_frame_rate; + double output_framerate; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; @@ -458,6 +503,9 @@ typedef struct VP9_COMP { SPEED_FEATURES sf; int error_bins[1024]; + unsigned int max_mv_magnitude; + int mv_step_param; + // Data used for real time conferencing mode to help determine if it would be good to update the gf int inter_zz_count; int gf_bad_count; @@ -466,7 +514,7 @@ typedef struct VP9_COMP { unsigned char *segmentation_map; // segment threashold for encode breakout - int segment_encode_breakout[MAX_MB_SEGMENTS]; + int segment_encode_breakout[MAX_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; @@ -475,7 +523,7 @@ typedef struct VP9_COMP { vp9_full_search_fn_t full_search_sad; vp9_refining_search_fn_t refining_search_sad; vp9_diamond_search_fn_t diamond_search_sad; - vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS]; + vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZE_TYPES]; uint64_t time_receive_data; uint64_t time_compress_data; uint64_t 
time_pick_lpf; @@ -570,7 +618,8 @@ typedef struct VP9_COMP { unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS]; - unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS]; + + unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB]; int initial_width; int initial_height; @@ -617,21 +666,4 @@ extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, extern void vp9_alloc_compressor_data(VP9_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval" at %s:%d", \ - __FILE__,__LINE__);\ - } while(0) -#else -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval);\ - } while(0) -#endif - #endif // VP9_ENCODER_VP9_ONYX_INT_H_ diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c index a87d058..2b8f2cd 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -127,6 +127,7 @@ void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) { void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + struct loopfilter *lf = &cpi->mb.e_mbd.lf; int best_err = 0; int filt_err = 0; @@ -135,7 +136,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int filter_step; int filt_high = 0; - int filt_mid = cm->filter_level; // Start search at previous frame filter level + // Start search at previous frame filter level + int filt_mid = lf->filter_level; int filt_low = 0; int filt_best; int filt_direction = 0; @@ -146,12 +148,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); if (cm->frame_type == KEY_FRAME) - cm->sharpness_level = 0; + lf->sharpness_level = 0; else - cm->sharpness_level = cpi->oxcf.Sharpness; + lf->sharpness_level = cpi->oxcf.Sharpness; // Start the search at the previous frame filter level unless it is now out of range. - filt_mid = cm->filter_level; + filt_mid = lf->filter_level; if (filt_mid < min_filter_level) filt_mid = min_filter_level; @@ -179,7 +181,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { Bias = Bias * cpi->twopass.section_intra_rating / 20; // yx, bias less for large block size - if (cpi->common.txfm_mode != ONLY_4X4) + if (cpi->common.tx_mode != ONLY_4X4) Bias >>= 1; filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step); @@ -232,5 +234,5 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { } } - cm->filter_level = filt_best; + lf->filter_level = filt_best; } diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 53d8be7..525f4da 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -21,105 +21,145 @@ extern int enc_debug; #endif -static INLINE int plane_idx(int plane) { - return plane == 0 ? 0 : - plane == 1 ? 
16 : 20; -} - -static void quantize(int16_t *zbin_boost_orig_ptr, - int16_t *coeff_ptr, int n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, - uint8_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, - int16_t *dequant_ptr, int zbin_oq_value, - uint16_t *eob_ptr, - const int *scan, int mul) { +void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, + int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int i, rc, eob; - int zbin; + int zbins[2], nzbins[2], zbin; int x, y, z, sz; - int zero_run = 0; - int16_t *zbin_boost_ptr = zbin_boost_orig_ptr; + int zero_flag = n_coeffs; vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); eob = -1; + // Base ZBIN + zbins[0] = zbin_ptr[0] + zbin_oq_value; + zbins[1] = zbin_ptr[1] + zbin_oq_value; + nzbins[0] = zbins[0] * -1; + nzbins[1] = zbins[1] * -1; + if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc] * mul; + // Pre-scan pass + for (i = n_coeffs - 1; i >= 0; i--) { + rc = scan[i]; + z = coeff_ptr[rc]; + + if (z < zbins[rc != 0] && z > nzbins[rc != 0]) { + zero_flag--; + } else { + break; + } + } - zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value); - zero_run += (zero_run < 15); + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < zero_flag; i++) { + rc = scan[i]; + z = coeff_ptr[rc]; + + zbin = (zbins[rc != 0]); sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) + x = (z ^ sz) - sz; if (x >= zbin) { x += (round_ptr[rc != 0]); - y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) - >> quant_shift_ptr[rc != 0]; // quantize (x) + y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * + quant_shift_ptr[rc != 0]) >> 16; // quantize (x) x = (y ^ sz) - sz; // get the sign back qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value if (y) { eob = i; // last nonzero coeffs - zero_run = 0; } } } } - *eob_ptr = eob + 1; } -void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, - TX_TYPE tx_type) { - MACROBLOCKD *const xd = &mb->e_mbd; - const int mul = n_coeffs == 1024 ? 2 : 1; - const int *scan; - - // These contexts may be available in the caller - switch (n_coeffs) { - case 4 * 4: - scan = get_scan_4x4(tx_type); - break; - case 8 * 8: - scan = get_scan_8x8(tx_type); - break; - case 16 * 16: - scan = get_scan_16x16(tx_type); - break; - default: - scan = vp9_default_scan_32x32; - break; - } +// This function works well for large transform size. 
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + int16_t *zbin_ptr, int16_t *round_ptr, + int16_t *quant_ptr, int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + int16_t *dequant_ptr, int zbin_oq_value, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, rc, eob; + int zbins[2], nzbins[2], zbin; + int x, y, z, sz; + int idx = 0; + int idx_arr[1024]; + + vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + + eob = -1; + + // Base ZBIN + zbins[0] = zbin_ptr[0] + zbin_oq_value; + zbins[1] = zbin_ptr[1] + zbin_oq_value; + nzbins[0] = zbins[0] * -1; + nzbins[1] = zbins[1] * -1; + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + rc = scan[i]; + z = coeff_ptr[rc] * 2; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (z >= zbins[rc != 0] || z <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } - quantize(mb->plane[plane].zrun_zbin_boost, - BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), - n_coeffs, mb->skip_block, - mb->plane[plane].zbin, - mb->plane[plane].round, - mb->plane[plane].quant, - mb->plane[plane].quant_shift, - BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - xd->plane[plane].dequant, - mb->plane[plane].zbin_extra, - &xd->plane[plane].eobs[block], - scan, mul); + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + rc = scan[idx_arr[i]]; + + // Calculate ZBIN + zbin = (zbins[rc != 0]); + + z = coeff_ptr[rc] * 2; + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * + quant_shift_ptr[rc != 0]) >> 16; // quantize (x) + + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value + + if (y) { + eob = idx_arr[i]; // last nonzero coeffs + } + } + } + } + *eob_ptr = eob + 1; } void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks) { MACROBLOCKD *const xd = &mb->e_mbd; const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx); - const int *pt_scan = get_scan_4x4(tx_type); + const int16_t *scan = get_scan_4x4(tx_type); + const int16_t *iscan = get_iscan_4x4(tx_type); - quantize(mb->plane[pb_idx.plane].zrun_zbin_boost, - BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16), + vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16), 16, mb->skip_block, mb->plane[pb_idx.plane].zbin, mb->plane[pb_idx.plane].round, @@ -130,10 +170,10 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, xd->plane[pb_idx.plane].dequant, mb->plane[pb_idx.plane].zbin_extra, &xd->plane[pb_idx.plane].eobs[pb_idx.block], - pt_scan, 1); + scan, iscan); } -static void invert_quant(int16_t *quant, uint8_t *shift, int d) { +static void invert_quant(int16_t *quant, int16_t *shift, int d) { unsigned t; int l; t = d; @@ -141,7 +181,7 @@ static void invert_quant(int16_t *quant, uint8_t *shift, int d) { t >>= 1; t = 1 + (1 << (16 + l)) / d; *quant = (int16_t)(t - (1 << 16)); - *shift = l; + *shift = 1 << (16 - l); } void vp9_init_quantizer(VP9_COMP *cpi) { @@ -153,9 +193,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { #endif int q; - static const int zbin_boost[16] = { 
0, 0, 0, 8, 8, 8, 10, 12, - 14, 16, 20, 24, 28, 32, 36, 40 }; - for (q = 0; q < QINDEX_RANGE; q++) { int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80; int qrounding_factor = 48; @@ -163,20 +200,19 @@ void vp9_init_quantizer(VP9_COMP *cpi) { qzbin_factor = 64; qrounding_factor = 64; } + // dc values quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q); invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val); cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.y_dequant[q][0] = quant_val; - cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7; quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q); invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val); cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.uv_dequant[q][0] = quant_val; - cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7; #if CONFIG_ALPHA quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q); @@ -184,42 +220,49 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.a_dequant[q][0] = quant_val; - cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7; #endif quant_val = vp9_ac_quant(q, 0); + invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val); + cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7; cpi->common.y_dequant[q][1] = quant_val; + quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q); + invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1, + quant_uv_val); + cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7); + cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7; cpi->common.uv_dequant[q][1] = quant_uv_val; + #if CONFIG_ALPHA quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q); + invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1, + quant_alpha_val); + cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); + cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7; cpi->common.a_dequant[q][1] = quant_alpha_val; #endif - // all the 4x4 ac values =; - for (i = 1; i < 16; i++) { - int rc = vp9_default_scan_4x4[i]; - - invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val); - cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7; - cpi->zrun_zbin_boost_y[q][i] = - ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7); - - invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc, - quant_uv_val); - cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7); - cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7; - cpi->zrun_zbin_boost_uv[q][i] = - ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7); + + for (i = 2; i < 8; i++) { + cpi->y_quant[q][i] = cpi->y_quant[q][1]; + cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1]; + cpi->y_zbin[q][i] = cpi->y_zbin[q][1]; + cpi->y_round[q][i] = cpi->y_round[q][1]; + cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1]; + + cpi->uv_quant[q][i] = cpi->uv_quant[q][1]; + cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1]; + cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1]; + cpi->uv_round[q][i] = 
cpi->uv_round[q][1]; + cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1]; #if CONFIG_ALPHA - invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc, - quant_alpha_val); - cpi->a_zbin[q][rc] = - ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); - cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7; - cpi->zrun_zbin_boost_a[q][i] = - ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7); + cpi->a_quant[q][i] = cpi->a_quant[q][1]; + cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1]; + cpi->a_zbin[q][i] = cpi->a_zbin[q][1]; + cpi->a_round[q][i] = cpi->a_round[q][1]; + cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1]; #endif } } @@ -240,7 +283,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].quant_shift = cpi->y_quant_shift[qindex]; x->plane[0].zbin = cpi->y_zbin[qindex]; x->plane[0].round = cpi->y_round[qindex]; - x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex]; x->plane[0].zbin_extra = (int16_t)zbin_extra; x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex]; @@ -253,7 +295,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_shift = cpi->uv_quant_shift[qindex]; x->plane[i].zbin = cpi->uv_zbin[qindex]; x->plane[i].round = cpi->uv_round[qindex]; - x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex]; x->plane[i].zbin_extra = (int16_t)zbin_extra; x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex]; } @@ -263,12 +304,11 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[3].quant_shift = cpi->a_quant_shift[qindex]; x->plane[3].zbin = cpi->a_zbin[qindex]; x->plane[3].round = cpi->a_round[qindex]; - x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex]; x->plane[3].zbin_extra = (int16_t)zbin_extra; x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; #endif - x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); + x->skip_block = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ x->e_mbd.q_index = qindex; diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h index 2b1eeab..3229eaa 100644 --- a/libvpx/vp9/encoder/vp9_quantize.h +++ b/libvpx/vp9/encoder/vp9_quantize.h @@ -22,9 +22,6 @@ #define prototype_quantize_mb(sym) \ void (sym)(MACROBLOCK *x) -void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs, - TX_TYPE tx_type); - void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2, int y_blocks); void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 430d3a8..d3a9529 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -17,7 +17,6 @@ #include <math.h> #include "vp9/common/vp9_alloccommon.h" -#include "vp9/common/vp9_modecont.h" #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_entropymode.h" @@ -33,46 +32,8 @@ // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 -// % adjustment to target kf size based on seperation from previous frame -static const int kf_boost_seperation_adjustment[16] = { - 30, 40, 50, 55, 60, 65, 70, 75, - 80, 85, 90, 95, 100, 100, 100, 100, -}; - -static const int gf_adjust_table[101] = { - 100, - 115, 130, 145, 160, 175, 190, 200, 210, 220, 230, - 240, 260, 270, 280, 290, 300, 310, 320, 330, 340, - 350, 360, 370, 380, 390, 400, 400, 400, 400, 400, - 
400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, -}; - -static const int gf_intra_usage_adjustment[20] = { - 125, 120, 115, 110, 105, 100, 95, 85, 80, 75, - 70, 65, 60, 55, 50, 50, 50, 50, 50, 50, -}; - -static const int gf_interval_table[101] = { - 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, -}; - -static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 }; +static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = + { 1, 2, 3, 4, 5 }; // These functions use formulaic calculations to make playing with the // quantizer tables easier. If necessary they can be replaced by lookup @@ -128,7 +89,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs); + vp9_copy(cc->segment_pred_probs, xd->seg.pred_probs); vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); @@ -138,14 +99,12 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); - vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas); - vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas); + vp9_copy(cc->last_ref_lf_deltas, xd->lf.last_ref_deltas); + vp9_copy(cc->last_mode_lf_deltas, xd->lf.last_mode_deltas); vp9_copy(cc->coef_probs, cm->fc.coef_probs); vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); - vp9_copy(cc->tx_probs_8x8p, cm->fc.tx_probs_8x8p); - vp9_copy(cc->tx_probs_16x16p, cm->fc.tx_probs_16x16p); - vp9_copy(cc->tx_probs_32x32p, cm->fc.tx_probs_32x32p); + cc->tx_probs = cm->fc.tx_probs; vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs); } @@ -168,7 +127,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs); + vp9_copy(xd->seg.pred_probs, cc->segment_pred_probs); vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); @@ -179,14 +138,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); - vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas); - vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas); + vp9_copy(xd->lf.last_ref_deltas, cc->last_ref_lf_deltas); + vp9_copy(xd->lf.last_mode_deltas, cc->last_mode_lf_deltas); vp9_copy(cm->fc.coef_probs, cc->coef_probs); vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); - vp9_copy(cm->fc.tx_probs_8x8p, cc->tx_probs_8x8p); - vp9_copy(cm->fc.tx_probs_16x16p, cc->tx_probs_16x16p); - vp9_copy(cm->fc.tx_probs_32x32p, cc->tx_probs_32x32p); + cm->fc.tx_probs = cc->tx_probs; 
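// The three per-size transform-size probability tables are grouped into a
// single aggregate in this revision, so one struct assignment replaces the
// three vp9_copy() calls that previously restored tx_probs_8x8p,
// tx_probs_16x16p and tx_probs_32x32p individually. A minimal sketch of the
// idea (field names and dimensions are illustrative, not the actual
// definition in the libvpx headers):
//
//   struct tx_probs_sketch {
//     vp9_prob p8x8[2][1];
//     vp9_prob p16x16[2][2];
//     vp9_prob p32x32[2][3];
//   };
//   struct tx_probs_sketch saved, current;
//   saved = current;   // copies every embedded array by value
//
// Because the aggregate holds no pointers, assigning it copies all of the
// probabilities, which is what the removed element-wise copies did.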
vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs); } @@ -456,7 +413,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) { * whichever is smaller. */ int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1; - av_key_frame_frequency = (int)cpi->output_frame_rate * 2; + av_key_frame_frequency = (int)cpi->output_framerate * 2; if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) av_key_frame_frequency = cpi->oxcf.key_freq; diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 9cb7ab0..843cf3f 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -53,56 +53,49 @@ DECLARE_ALIGNED(16, extern const uint8_t, #define SPLITMV 0x10000 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - {ZEROMV, LAST_FRAME, NONE}, - {DC_PRED, INTRA_FRAME, NONE}, - {NEARESTMV, LAST_FRAME, NONE}, - {NEARMV, LAST_FRAME, NONE}, - - {ZEROMV, GOLDEN_FRAME, NONE}, + {NEARESTMV, ALTREF_FRAME, NONE}, {NEARESTMV, GOLDEN_FRAME, NONE}, + {NEWMV, LAST_FRAME, NONE}, + {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, + {NEARMV, LAST_FRAME, NONE}, + {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - {ZEROMV, ALTREF_FRAME, NONE}, - {NEARESTMV, ALTREF_FRAME, NONE}, + {DC_PRED, INTRA_FRAME, NONE}, - {NEARMV, GOLDEN_FRAME, NONE}, + {NEWMV, GOLDEN_FRAME, NONE}, + {NEWMV, ALTREF_FRAME, NONE}, {NEARMV, ALTREF_FRAME, NONE}, - {V_PRED, INTRA_FRAME, NONE}, - {H_PRED, INTRA_FRAME, NONE}, - {D45_PRED, INTRA_FRAME, NONE}, - {D135_PRED, INTRA_FRAME, NONE}, - {D117_PRED, INTRA_FRAME, NONE}, - {D153_PRED, INTRA_FRAME, NONE}, - {D27_PRED, INTRA_FRAME, NONE}, - {D63_PRED, INTRA_FRAME, NONE}, - {TM_PRED, INTRA_FRAME, NONE}, - {NEWMV, LAST_FRAME, NONE}, - {NEWMV, GOLDEN_FRAME, NONE}, - {NEWMV, ALTREF_FRAME, NONE}, + {NEARMV, LAST_FRAME, ALTREF_FRAME}, + {NEWMV, LAST_FRAME, ALTREF_FRAME}, + {NEARMV, GOLDEN_FRAME, NONE}, + {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, + {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, {SPLITMV, LAST_FRAME, NONE}, {SPLITMV, GOLDEN_FRAME, NONE}, {SPLITMV, ALTREF_FRAME, NONE}, + {SPLITMV, LAST_FRAME, ALTREF_FRAME}, + {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, - {I4X4_PRED, INTRA_FRAME, NONE}, - - /* compound prediction modes */ + {ZEROMV, LAST_FRAME, NONE}, + {ZEROMV, GOLDEN_FRAME, NONE}, + {ZEROMV, ALTREF_FRAME, NONE}, {ZEROMV, LAST_FRAME, ALTREF_FRAME}, - {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, - {NEARMV, LAST_FRAME, ALTREF_FRAME}, - {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, - {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {NEWMV, LAST_FRAME, ALTREF_FRAME}, - {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, - {SPLITMV, LAST_FRAME, ALTREF_FRAME}, - {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, + {I4X4_PRED, INTRA_FRAME, NONE}, + {H_PRED, INTRA_FRAME, NONE}, + {V_PRED, INTRA_FRAME, NONE}, + {D135_PRED, INTRA_FRAME, NONE}, + {D27_PRED, INTRA_FRAME, NONE}, + {D153_PRED, INTRA_FRAME, NONE}, + {D63_PRED, INTRA_FRAME, NONE}, + {D117_PRED, INTRA_FRAME, NONE}, + {D45_PRED, INTRA_FRAME, NONE}, }; // The baseline rd thresholds for breaking out of the rd loop for @@ -116,8 +109,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] = #define MAX_RD_THRESH_FREQ_FACT 32 #define MAX_RD_THRESH_FREQ_INC 1 -static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES], - vp9_coeff_count (*cnoskip)[BLOCK_TYPES], +static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2], vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { int i, j, k, l; TX_SIZE t; @@ -128,26 +120,21 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES], for (l = 0; l < PREV_COEF_CONTEXTS; l++) { vp9_prob 
probs[ENTROPY_NODES]; vp9_model_to_full_probs(p[t][i][j][k][l], probs); - vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs, + vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs, vp9_coef_tree); -#if CONFIG_BALANCED_COEFTREE - // Replace the eob node prob with a very small value so that the - // cost approximately equals the cost without the eob node - probs[1] = 1; - vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree); -#else - vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs, + vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs, vp9_coef_tree); - assert(c[t][i][j][k][l][DCT_EOB_TOKEN] == - cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]); -#endif + assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] == + c[t][i][j][1][k][l][DCT_EOB_TOKEN]); } } -static int rd_iifactor[32] = { 4, 4, 3, 2, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, }; +static const int rd_iifactor[32] = { + 4, 4, 3, 2, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; // 3* dc_qlookup[Q]*dc_qlookup[Q]; @@ -227,7 +214,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->rd_threshes[bsize][i] = INT_MAX; } cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; + + if (cpi->sf.adaptive_rd_thresh) + cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; + else + cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } else { @@ -247,14 +238,16 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->rd_threshes[bsize][i] = INT_MAX; } cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; + + if (cpi->sf.adaptive_rd_thresh) + cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; + else + cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } - fill_token_costs(cpi->mb.token_costs, - cpi->mb.token_costs_noskip, - cpi->common.fc.coef_probs); + fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs); for (i = 0; i < NUM_PARTITION_CONTEXTS; i++) vp9_cost_tokens(cpi->mb.partition_cost[i], @@ -271,168 +264,619 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->mb.nmvcost_hp : cpi->mb.nmvcost, &cpi->common.fc.nmvc, cpi->mb.e_mbd.allow_high_precision_mv, 1, 1); + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) { + MB_PREDICTION_MODE m; + + for (m = NEARESTMV; m < MB_MODE_COUNT; m++) + cpi->mb.inter_mode_cost[i][m - NEARESTMV] = + cost_token(vp9_inter_mode_tree, + cpi->common.fc.inter_mode_probs[i], + vp9_inter_mode_encodings - NEARESTMV + m); + } + } +} + +static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) { + return bsize_from_dim_lookup[bwl][bhl]; +} + +static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize, + struct macroblockd_plane *pd) { + return get_block_size(plane_block_width_log2by4(bsize, pd), + plane_block_height_log2by4(bsize, pd)); +} + +static INLINE void linear_interpolate2(double x, int ntab, int inv_step, + const double *tab1, const double *tab2, + double *v1, double *v2) { + double y = x * inv_step; + int d = (int) y; + if (d >= ntab - 1) { + *v1 = tab1[ntab - 1]; + *v2 = tab2[ntab - 1]; + } else { + double a = y - d; + *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a; + *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a; } } -int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) { - int i, error = 0; +static void model_rd_norm(double x, double *R, double *D) { + static 
const int inv_tab_step = 8; + static const int tab_size = 120; + // NOTE: The tables below must be of the same size + // + // Normalized rate + // This table models the rate for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const double rate_tab[] = { + 64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194, + 2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206, + 1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708, + 0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412, + 0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236, + 0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132, + 0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073, + 0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040, + 0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022, + 0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012, + 0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006, + 0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003, + 0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002, + 0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, + 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000, + }; + // Normalized distortion + // This table models the normalized distortion for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance) + // Note the actual distortion is Dn * variance. + static const double dist_tab[] = { + 0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061, + 0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242, + 0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458, + 0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645, + 0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780, + 0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870, + 0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925, + 0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957, + 0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976, + 0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987, + 0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993, + 0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996, + 0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998, + 0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000, + }; + /* + assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]); + assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]); + assert(sizeof(rate_tab) == sizeof(dist_tab)); + */ + assert(x >= 0.0); + linear_interpolate2(x, tab_size, inv_tab_step, + rate_tab, dist_tab, R, D); +} + +static void model_rd_from_var_lapndz(int var, int n, int qstep, + int *rate, int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. 
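// For reference, the closed forms quoted above can be evaluated directly;
// a minimal sketch (helper names are hypothetical, assumes <math.h>, valid
// for x > 0 -- the x == 0 entry of rate_tab is a cap rather than the limit):
//
//   static double bin_entropy(double p) {   // H(p), in bits
//     return (p <= 0.0 || p >= 1.0) ? 0.0
//                                   : -p * log2(p) - (1 - p) * log2(1 - p);
//   }
//   static void closed_form_rd_norm(double x, double *R, double *D) {
//     const double r = exp(-sqrt(2.0) * x);  // x = qpstep / sqrt(variance)
//     *R = bin_entropy(sqrt(r)) + sqrt(r) * (1.0 + bin_entropy(r) / (1.0 - r));
//     *D = 1.0 - (x / sqrt(2.0)) / sinh(x / sqrt(2.0));
//   }
//
// For example, x = 1.0 gives R ~= 2.014 and D ~= 0.079, matching rate_tab[8]
// and dist_tab[8] above (the table step is 1 / inv_tab_step = 1/8).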
+ vp9_clear_system_state(); + if (var == 0 || n == 0) { + *rate = 0; + *dist = 0; + } else { + double D, R; + double s2 = (double) var / n; + double x = qstep / sqrt(s2); + model_rd_norm(x, &R, &D); + *rate = ((n << 8) * R + 0.5); + *dist = (var * D + 0.5); + } + vp9_clear_system_state(); +} + +static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + int i, rate_sum = 0, dist_sum = 0; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + + // TODO(dkovalev) the same code in get_plane_block_size + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); + const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); + unsigned int sse; + int rate; + int64_t dist; + (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + // sse works better than var, since there is no dc prediction used + model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), + pd->dequant[1] >> 3, &rate, &dist); + + rate_sum += rate; + dist_sum += dist; + } + + *out_rate_sum = rate_sum; + *out_dist_sum = dist_sum << 4; +} + +static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
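// Worked example with illustrative numbers: for a 16x16 luma block,
// n = 16 << (bwl + bhl) = 256 samples. If the variance function reports
// sse = 4096 and pd->dequant[1] = 32, the effective step is 32 >> 3 = 4,
// the per-sample variance is s2 = 4096 / 256 = 16, and
// x = 4 / sqrt(16) = 1.0, so the tables give R ~= 2.014 and D ~= 0.079.
// model_rd_from_var_lapndz then returns rate ~= (256 << 8) * 2.014 ~= 132000
// and dist ~= 4096 * 0.079 ~= 324, before the << 4 applied to out_dist_sum
// below.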
+ struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + + // TODO(dkovalev) the same code in get_plane_block_size + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); + const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); + unsigned int sse; + int rate; + int64_t dist; + (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + // sse works better than var, since there is no dc prediction used + model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), + pd->dequant[1] >> 3, &rate, &dist); + + *out_rate_sum = rate; + *out_dist_sum = dist << 4; +} + +static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + TX_SIZE tx_size, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum, + int *out_skip) { + int t = 4, j, k; + BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int width = plane_block_width(bsize, pd); + const int height = plane_block_height(bsize, pd); + int rate_sum = 0; + int64_t dist_sum = 0; + + if (tx_size == TX_4X4) { + bs = BLOCK_4X4; + t = 4; + } else if (tx_size == TX_8X8) { + bs = BLOCK_8X8; + t = 8; + } else if (tx_size == TX_16X16) { + bs = BLOCK_16X16; + t = 16; + } else if (tx_size == TX_32X32) { + bs = BLOCK_32X32; + t = 32; + } else { + assert(0); + } + *out_skip = 1; + for (j = 0; j < height; j += t) { + for (k = 0; k < width; k += t) { + int rate; + int64_t dist; + unsigned int sse; + (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k, + p->src.stride, + pd->dst.buf + j * pd->dst.stride + k, + pd->dst.stride, &sse); + // sse works better than var, since there is no dc prediction used + model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, + &rate, &dist); + rate_sum += rate; + dist_sum += dist; + *out_skip &= (rate < 1024); + } + } + *out_rate_sum = rate_sum; + *out_dist_sum = (dist_sum << 4); +} + +int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; for (i = 0; i < block_size; i++) { int this_diff = coeff[i] - dqcoeff[i]; - error += this_diff * this_diff; + error += (unsigned)this_diff * this_diff; + sqcoeff += (unsigned) coeff[i] * coeff[i]; } + *ssz = sqcoeff; return error; } +static const int16_t band_counts[TX_SIZE_MAX_SB][8] = { + { 1, 2, 3, 4, 3, 16 - 13 }, + { 1, 2, 3, 4, 11, 64 - 21 }, + { 1, 2, 3, 4, 11, 256 - 21 }, + { 1, 2, 3, 4, 11, 1024 - 21 }, +}; + static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, int plane, int block, PLANE_TYPE type, - ENTROPY_CONTEXT *A, - ENTROPY_CONTEXT *L, + ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, - int y_blocks) { + const int16_t *scan, const int16_t *nb) { MACROBLOCKD *const xd = &mb->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - int pt; - int c = 0; - int cost = 0, pad; - const int *scan, *nb; + int pt, c, cost; + const int16_t *band_count = band_counts[tx_size]; const int eob = xd->plane[plane].eobs[block]; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, - block, 16); + const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); const int ref = mbmi->ref_frame[0] != INTRA_FRAME; - unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - ENTROPY_CONTEXT above_ec, left_ec; - TX_TYPE tx_type = DCT_DCT; - - const int 
segment_id = xd->mode_info_context->mbmi.segment_id; - unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = - mb->token_costs_noskip[tx_size][type][ref]; - - int seg_eob, default_eob; + unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref]; + ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; uint8_t token_cache[1024]; - const uint8_t * band_translate; // Check for consistency of tx_size with mode info assert((!type && !plane) || (type && plane)); if (type == PLANE_TYPE_Y_WITH_DC) { assert(xd->mode_info_context->mbmi.txfm_size == tx_size); } else { - TX_SIZE tx_size_uv = get_uv_tx_size(mbmi); - assert(tx_size == tx_size_uv); + assert(tx_size == get_uv_tx_size(mbmi)); } + pt = combine_entropy_contexts(above_ec, left_ec); + + if (eob == 0) { + // single eob token + cost = token_costs[0][0][pt][DCT_EOB_TOKEN]; + c = 0; + } else { + int v, prev_t, band = 1, band_left = band_count[1]; + + // dc token + v = qcoeff_ptr[0]; + prev_t = vp9_dct_value_tokens_ptr[v].token; + cost = token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; + token_cache[0] = vp9_pt_energy_class[prev_t]; + + // ac tokens + for (c = 1; c < eob; c++) { + const int rc = scan[c]; + int t; + + v = qcoeff_ptr[rc]; + t = vp9_dct_value_tokens_ptr[v].token; + pt = get_coef_context(nb, token_cache, c); + cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v]; + token_cache[rc] = vp9_pt_energy_class[t]; + prev_t = t; + if (!--band_left) { + band_left = band_count[++band]; + } + } + + // eob token + if (band < 6) { + pt = get_coef_context(nb, token_cache, c); + cost += token_costs[0][band][pt][DCT_EOB_TOKEN]; + } + } + + // is eob first coefficient; + *A = *L = c > 0; + + return cost; +} + +struct rdcost_block_args { + VP9_COMMON *cm; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + TX_SIZE tx_size; + int bw; + int bh; + int rate; + int64_t dist; + int64_t sse; + int64_t best_rd; + int skip; + const int16_t *scan, *nb; +}; + +static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args* args = arg; + MACROBLOCK* const x = args->x; + MACROBLOCKD* const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + int64_t this_sse; + int shift = args->tx_size == TX_32X32 ? 0 : 2; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse) >> shift; + args->sse += this_sse >> shift; + + if (x->skip_encode && + xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { + // TODO(jingning): tune the model to better capture the distortion. 
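// The penalty below grows with the squared quantizer step (pd->dequant[1])
// and with the coefficient count of the transform block (tracked by
// 1 << ss_txfrm_size), and it is added to both dist and sse: when
// x->skip_encode makes intra prediction use the source pixels themselves as
// the reference (see the x->skip_encode ? src : dst selections later in this
// file), the true reconstruction error is not available, so this term stands
// in for it, as the TODO notes.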
+ int64_t p = (pd->dequant[1] * pd->dequant[1] * + (1 << ss_txfrm_size)) >> shift; + args->dist += p; + args->sse += p; + } +} + +static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args* args = arg; + int x_idx, y_idx; + MACROBLOCKD * const xd = &args->x->e_mbd; + + txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, + &y_idx); + + args->rate += cost_coeffs(args->cm, args->x, plane, block, + xd->plane[plane].plane_type, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); +} + +// FIXME(jingning): need to make the rd test of chroma components consistent +// with that of luma component. this function should be deprecated afterwards. +static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + MACROBLOCKD * const xd = &x->e_mbd; + const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]); + const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]); + const int bw = 1 << bwl, bh = 1 << bhl; + int i; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + 0, 0, 0, INT64_MAX, 0 }; + switch (tx_size) { - case TX_4X4: { - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_4x4(xd, block) : DCT_DCT; - above_ec = A[0] != 0; - left_ec = L[0] != 0; - seg_eob = 16; - scan = get_scan_4x4(tx_type); - band_translate = vp9_coefband_trans_4x4; + case TX_4X4: + vpx_memcpy(&args.t_above, xd->plane[plane].above_context, + sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(&args.t_left, xd->plane[plane].left_context, + sizeof(ENTROPY_CONTEXT) * bh); + args.scan = vp9_default_scan_4x4; + args.nb = vp9_default_scan_4x4_neighbors; break; - } - case TX_8X8: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 1 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; - above_ec = (A[0] + A[1]) != 0; - left_ec = (L[0] + L[1]) != 0; - scan = get_scan_8x8(tx_type); - seg_eob = 64; - band_translate = vp9_coefband_trans_8x8plus; + case TX_8X8: + for (i = 0; i < bw; i += 2) + args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i]; + for (i = 0; i < bh; i += 2) + args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i]; + args.scan = vp9_default_scan_8x8; + args.nb = vp9_default_scan_8x8_neighbors; break; - } - case TX_16X16: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 2 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; - scan = get_scan_16x16(tx_type); - seg_eob = 256; - above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; - band_translate = vp9_coefband_trans_8x8plus; + case TX_16X16: + for (i = 0; i < bw; i += 4) + args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i]; + for (i = 0; i < bh; i += 4) + args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i]; + args.scan = vp9_default_scan_16x16; + args.nb = vp9_default_scan_16x16_neighbors; break; - } case TX_32X32: - scan = vp9_default_scan_32x32; - seg_eob = 1024; - above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; - band_translate = vp9_coefband_trans_8x8plus; + for (i = 0; i < bw; i += 8) + args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i]; + for (i = 0; i < bh; i += 8) + args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i]; + args.scan = vp9_default_scan_32x32; + args.nb = vp9_default_scan_32x32_neighbors; break; default: - abort(); - break; + assert(0); } - assert(eob <= seg_eob); - pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); - default_eob = seg_eob; - - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) - seg_eob = 0; - - /* sanity check to ensure that we do not have spurious non-zero q values */ - if (eob < seg_eob) - assert(qcoeff_ptr[scan[eob]] == 0); - - { - for (c = 0; c < eob; c++) { - int v = qcoeff_ptr[scan[c]]; - int t = vp9_dct_value_tokens_ptr[v].token; - int band = get_coef_band(band_translate, c); - if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob); - - if (!c || token_cache[scan[c - 1]]) // do not skip eob - cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v]; - else - cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v]; - token_cache[scan[c]] = vp9_pt_energy_class[t]; - } - if (c < seg_eob) { - if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob); - cost += mb->token_costs_noskip[tx_size][type][ref] - [get_coef_band(band_translate, c)] - [pt][DCT_EOB_TOKEN]; - } + foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args); + return args.rate; +} + +static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + int cost = 0, plane; + + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + cost += rdcost_plane(cm, x, plane, bsize, tx_size); } + return cost; +} - // is eob first coefficient; - for (pt = 0; pt < (1 << tx_size); pt++) { - A[pt] = L[pt] = c > 0; +static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift, int64_t *sse) { + struct macroblockd_plane *p = &x->e_mbd.plane[0]; + const int bwl = plane_block_width_log2by4(bsize, p); + const int bhl = plane_block_height_log2by4(bsize, p); + int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, + 16 << (bwl + bhl), sse) >> shift; + *sse >>= shift; + return e; +} + +static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift, int64_t *sse) { + int64_t sum = 0, this_sse; + int plane; + + *sse = 0; + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + struct macroblockd_plane *p = &x->e_mbd.plane[plane]; + const int bwl = plane_block_width_log2by4(bsize, p); + const int bhl = plane_block_height_log2by4(bsize, p); + sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, + 16 << 
(bwl + bhl), &this_sse); + *sse += this_sse; } + *sse >>= shift; + return sum >> shift; +} - return cost; +static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args *args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct encode_b_args encode_args = {args->cm, x, NULL}; + int64_t rd1, rd2, rd; + + if (args->skip) + return; + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); + rd = MIN(rd1, rd2); + if (rd > args->best_rd) { + args->skip = 1; + args->rate = INT_MAX; + args->dist = INT64_MAX; + args->sse = INT64_MAX; + return; + } + + if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) + encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args); + else + xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args); + + dist_block(plane, block, bsize, ss_txfrm_size, args); + rate_block(plane, block, bsize, ss_txfrm_size, args); +} + +static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); + const int bw = 1 << bwl, bh = 1 << bhl; + int i; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + 0, 0, 0, ref_best_rd, 0 }; + xd->mode_info_context->mbmi.txfm_size = tx_size; + switch (tx_size) { + case TX_4X4: + vpx_memcpy(&args.t_above, pd->above_context, + sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(&args.t_left, pd->left_context, + sizeof(ENTROPY_CONTEXT) * bh); + get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0), + &args.scan, &args.nb); + break; + case TX_8X8: + for (i = 0; i < bw; i += 2) + args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 2) + args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; + get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd), + &args.scan, &args.nb); + break; + case TX_16X16: + for (i = 0; i < bw; i += 4) + args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 4) + args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; + get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd), + &args.scan, &args.nb); + break; + case TX_32X32: + for (i = 0; i < bw; i += 8) + args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 8) + args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; + args.scan = vp9_default_scan_32x32; + args.nb = vp9_default_scan_32x32_neighbors; + break; + default: + assert(0); + } + + foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args); + *distortion = args.dist; + *rate = args.rate; + *sse = args.sse; + *skippable = vp9_sby_is_skippable(xd, bsize) && (!args.skip); +} + +static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE_TYPE bs) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + if (max_txfm_size == TX_32X32 && + (cm->tx_mode == ALLOW_32X32 || + cm->tx_mode == TX_MODE_SELECT)) { 
+ mbmi->txfm_size = TX_32X32; + } else if (max_txfm_size >= TX_16X16 && + (cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + cm->tx_mode == TX_MODE_SELECT)) { + mbmi->txfm_size = TX_16X16; + } else if (cm->tx_mode != ONLY_4X4) { + mbmi->txfm_size = TX_8X8; + } else { + mbmi->txfm_size = TX_4X4; + } + super_block_yrd_for_txfm(cm, x, rate, distortion, skip, + &sse[mbmi->txfm_size], ref_best_rd, bs, + mbmi->txfm_size); + cpi->txfm_stepdown_count[0]++; } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, - int *d, int *distortion, + int64_t *d, int64_t *distortion, int *s, int *skip, int64_t txfm_cache[NB_TXFM_MODES], - TX_SIZE max_txfm_size) { + BLOCK_SIZE_TYPE bs) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP); + vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZE_MAX_SB][2]; int n, m; int s0, s1; - const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); for (n = TX_4X4; n <= max_txfm_size; n++) { r[n][1] = r[n][0]; + if (r[n][0] == INT_MAX) + continue; for (m = 0; m <= n - (n == max_txfm_size); m++) { if (m == n) r[n][1] += vp9_cost_zero(tx_probs[m]); @@ -446,6 +890,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s1 = vp9_cost_bit(skip_prob, 1); for (n = TX_4X4; n <= max_txfm_size; n++) { + if (d[n] == INT64_MAX) { + rd[n][0] = rd[n][1] = INT64_MAX; + continue; + } if (s[n]) { rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { @@ -455,29 +903,29 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, } if (max_txfm_size == TX_32X32 && - (cm->txfm_mode == ALLOW_32X32 || - (cm->txfm_mode == TX_MODE_SELECT && + (cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]))) { mbmi->txfm_size = TX_32X32; } else if (max_txfm_size >= TX_16X16 && - (cm->txfm_mode == ALLOW_16X16 || - cm->txfm_mode == ALLOW_32X32 || - (cm->txfm_mode == TX_MODE_SELECT && + (cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]))) { mbmi->txfm_size = TX_16X16; - } else if (cm->txfm_mode == ALLOW_8X8 || - cm->txfm_mode == ALLOW_16X16 || - cm->txfm_mode == ALLOW_32X32 || - (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { + } else if (cm->tx_mode == ALLOW_8X8 || + cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { mbmi->txfm_size = TX_8X8; } else { mbmi->txfm_size = TX_4X4; } *distortion = d[mbmi->txfm_size]; - *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT]; + *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->txfm_size]; txfm_cache[ONLY_4X4] = rd[TX_4X4][0]; @@ -494,119 +942,134 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, else txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ? 
rd[TX_4X4][1] : rd[TX_8X8][1]; -} -static int block_error(int16_t *coeff, int16_t *dqcoeff, - int block_size, int shift) { - int i; - int64_t error = 0; - - for (i = 0; i < block_size; i++) { - int this_diff = coeff[i] - dqcoeff[i]; - error += (unsigned)this_diff * this_diff; - } - error >>= shift; - - return error > INT_MAX ? INT_MAX : (int)error; -} - -static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, - 16 << (bwl + bhl), shift); -} - -static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - int64_t sum = 0; - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - const int subsampling = x->e_mbd.plane[plane].subsampling_x + - x->e_mbd.plane[plane].subsampling_y; - sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - 16 << (bwl + bhl - subsampling), 0); + if (max_txfm_size == TX_32X32 && + rd[TX_32X32][1] < rd[TX_16X16][1] && + rd[TX_32X32][1] < rd[TX_8X8][1] && + rd[TX_32X32][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[0]++; + } else if (max_txfm_size >= TX_16X16 && + rd[TX_16X16][1] < rd[TX_8X8][1] && + rd[TX_16X16][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + } else { + cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; } - sum >>= shift; - return sum > INT_MAX ? INT_MAX : (int)sum; -} - -struct rdcost_block_args { - VP9_COMMON *cm; - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int cost; -}; - -static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - struct rdcost_block_args* args = arg; - int x_idx, y_idx; - MACROBLOCKD * const xd = &args->x->e_mbd; - - txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, - &y_idx); - - args->cost += cost_coeffs(args->cm, args->x, plane, block, - xd->plane[plane].plane_type, args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->bw * args->bh); } -static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD * const xd = &x->e_mbd; - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - const int bw = 1 << bwl, bh = 1 << bhl; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 }; +static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, + int (*r)[2], int *rate, + int64_t *d, int64_t *distortion, + int *s, int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE_TYPE bs, + int *model_used) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); + int64_t rd[TX_SIZE_MAX_SB][2]; + int n, m; + int s0, s1; + double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00}; + // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00}; - vpx_memcpy(&args.t_above, xd->plane[plane].above_context, - sizeof(ENTROPY_CONTEXT) * bw); - 
vpx_memcpy(&args.t_left, xd->plane[plane].left_context, - sizeof(ENTROPY_CONTEXT) * bh); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); - foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args); + // for (n = TX_4X4; n <= max_txfm_size; n++) + // r[n][0] = (r[n][0] * scale_r[n]); - return args.cost; -} + for (n = TX_4X4; n <= max_txfm_size; n++) { + r[n][1] = r[n][0]; + for (m = 0; m <= n - (n == max_txfm_size); m++) { + if (m == n) + r[n][1] += vp9_cost_zero(tx_probs[m]); + else + r[n][1] += vp9_cost_one(tx_probs[m]); + } + } -static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - int cost = 0, plane; + assert(skip_prob > 0); + s0 = vp9_cost_bit(skip_prob, 0); + s1 = vp9_cost_bit(skip_prob, 1); - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - cost += rdcost_plane(cm, x, plane, bsize, tx_size); + for (n = TX_4X4; n <= max_txfm_size; n++) { + if (s[n]) { + rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); + } else { + rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); + } + } + for (n = TX_4X4; n <= max_txfm_size; n++) { + rd[n][0] = (scale_rd[n] * rd[n][0]); + rd[n][1] = (scale_rd[n] * rd[n][1]); } - return cost; -} -static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; - xd->mode_info_context->mbmi.txfm_size = tx_size; + if (max_txfm_size == TX_32X32 && + (cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && + rd[TX_32X32][1] <= rd[TX_16X16][1] && + rd[TX_32X32][1] <= rd[TX_8X8][1] && + rd[TX_32X32][1] <= rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_32X32; + } else if (max_txfm_size >= TX_16X16 && + (cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && + rd[TX_16X16][1] <= rd[TX_8X8][1] && + rd[TX_16X16][1] <= rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_16X16; + } else if (cm->tx_mode == ALLOW_8X8 || + cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && + rd[TX_8X8][1] <= rd[TX_4X4][1])) { + mbmi->txfm_size = TX_8X8; + } else { + mbmi->txfm_size = TX_4X4; + } - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - vp9_encode_intra_block_y(cm, x, bsize); - else - vp9_xform_quant_sby(cm, x, bsize); + if (model_used[mbmi->txfm_size]) { + // Actually encode using the chosen mode if a model was used, but do not + // update the r, d costs + super_block_yrd_for_txfm(cm, x, rate, distortion, skip, + &sse[mbmi->txfm_size], ref_best_rd, + bs, mbmi->txfm_size); + } else { + *distortion = d[mbmi->txfm_size]; + *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; + *skip = s[mbmi->txfm_size]; + } - *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 
0 : 2); - *rate = rdcost_plane(cm, x, 0, bsize, tx_size); - *skippable = vp9_sby_is_skippable(xd, bsize); + if (max_txfm_size == TX_32X32 && + rd[TX_32X32][1] <= rd[TX_16X16][1] && + rd[TX_32X32][1] <= rd[TX_8X8][1] && + rd[TX_32X32][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[0]++; + } else if (max_txfm_size >= TX_16X16 && + rd[TX_16X16][1] <= rd[TX_8X8][1] && + rd[TX_16X16][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + } else { + cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + } } static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int *distortion, - int *skip, BLOCK_SIZE_TYPE bs, - int64_t txfm_cache[NB_TXFM_MODES]) { + MACROBLOCK *x, int *rate, int64_t *distortion, + int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs, + int64_t txfm_cache[NB_TXFM_MODES], + int64_t ref_best_rd) { VP9_COMMON *const cm = &cpi->common; - int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; + int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB]; + int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -614,36 +1077,95 @@ static void super_block_yrd(VP9_COMP *cpi, if (mbmi->ref_frame[0] > INTRA_FRAME) vp9_subtract_sby(x, bs); - if (cpi->speed > 4) { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL || + (cpi->sf.tx_size_search_method != USE_FULL_RD && + mbmi->ref_frame[0] == INTRA_FRAME)) { + vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); + choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, + ref_best_rd, bs); + if (psse) + *psse = sse[mbmi->txfm_size]; + return; + } + + if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && + mbmi->ref_frame[0] > INTRA_FRAME) { + int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1}; if (bs >= BLOCK_SIZE_SB32X32) { - mbmi->txfm_size = TX_32X32; - } else if (bs >= BLOCK_SIZE_MB16X16) { - mbmi->txfm_size = TX_16X16; - } else if (bs >= BLOCK_SIZE_SB8X8) { - mbmi->txfm_size = TX_8X8; + if (model_used[TX_32X32]) { + model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, + &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], INT64_MAX, + bs, TX_32X32); + } + } + if (bs >= BLOCK_SIZE_MB16X16) { + if (model_used[TX_16X16]) { + model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, + &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], INT64_MAX, + bs, TX_16X16); + } + } + if (model_used[TX_8X8]) { + model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, + &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); } else { - mbmi->txfm_size = TX_4X4; + super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], INT64_MAX, bs, TX_8X8); } - vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs, - mbmi->txfm_size); - return; + if (model_used[TX_4X4]) { + model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, + &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], INT64_MAX, bs, TX_4X4); + } + choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, + skip, sse, ref_best_rd, bs, model_used); + } else { + if (bs >= BLOCK_SIZE_SB32X32) + super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], 
&d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], ref_best_rd, + bs, TX_32X32); + if (bs >= BLOCK_SIZE_MB16X16) + super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], ref_best_rd, + bs, TX_16X16); + super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], ref_best_rd, bs, TX_8X8); + super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], ref_best_rd, bs, TX_4X4); + choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, + skip, txfm_cache, bs); } - if (bs >= BLOCK_SIZE_SB32X32) - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], - bs, TX_32X32); - if (bs >= BLOCK_SIZE_MB16X16) - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], - bs, TX_16X16); - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs, - TX_8X8); - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs, - TX_4X4); - - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, - TX_32X32 - (bs < BLOCK_SIZE_SB32X32) - - (bs < BLOCK_SIZE_MB16X16)); + if (psse) + *psse = sse[mbmi->txfm_size]; +} + +static int conditional_skipintra(MB_PREDICTION_MODE mode, + MB_PREDICTION_MODE best_intra_mode) { + if (mode == D117_PRED && + best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D63_PRED && + best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D27_PRED && + best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D153_PRED && + best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; } static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, @@ -651,15 +1173,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, - int *bestdistortion, + int64_t *bestdistortion, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = INT64_MAX; int rate = 0; - int distortion; + int64_t distortion; VP9_COMMON *const cm = &cpi->common; - const int src_stride = x->plane[0].src.stride; + struct macroblock_plane *p = &x->plane[0]; + struct macroblockd_plane *pd = &xd->plane[0]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; uint8_t *src, *dst; int16_t *src_diff, *coeff; @@ -667,8 +1192,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT tl[2], templ[2]; TX_TYPE tx_type = DCT_DCT; TX_TYPE best_tx_type = DCT_DCT; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy, block; DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]); @@ -681,6 +1206,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) + continue; + } rate = bmode_costs[mode]; distortion = 0; @@ -688,25 +1219,30 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(tempa, ta, sizeof(ta)); vpx_memcpy(templ, tl, 
sizeof(tl)); - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { + int64_t ssz; + const int16_t *scan; + block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode.first = mode; + xd->mode_info_context->bmi[block].as_mode = mode; src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - x->plane[0].src.buf, src_stride); + p->src.buf, src_stride); src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block, - x->plane[0].src_diff); + p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride); - vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode, - dst, xd->plane[0].dst.stride); + pd->dst.buf, dst_stride); + vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), + TX_4X4, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride); vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, - dst, xd->plane[0].dst.stride); + dst, dst_stride); - tx_type = get_tx_type_4x4(xd, block); + tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block); if (tx_type != DCT_DCT) { vp9_short_fht4x4(src_diff, coeff, 8, tx_type); x->quantize_b_4x4(x, block, tx_type, 16); @@ -715,17 +1251,20 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->quantize_b_4x4(x, block, tx_type, 16); } + scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC, - tempa + idx, templ + idy, TX_4X4, 16); - distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, - block, 16), 16) >> 2; - - if (best_tx_type != DCT_DCT) - vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), - dst, xd->plane[0].dst.stride, best_tx_type); + tempa + idx, templ + idy, TX_4X4, scan, + vp9_get_coef_neighbors_handle(scan)); + distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, + block, 16), + 16, &ssz) >> 2; + + if (tx_type != DCT_DCT) + vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + dst, pd->dst.stride, tx_type); else - xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), - dst, xd->plane[0].dst.stride); + xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + dst, pd->dst.stride); } } @@ -741,34 +1280,41 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, best_tx_type = tx_type; vpx_memcpy(a, tempa, sizeof(tempa)); vpx_memcpy(l, templ, sizeof(templ)); - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { block = ib + idy * 2 + idx; vpx_memcpy(best_dqcoeff[idy * 2 + idx], - BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), + BLOCK_OFFSET(pd->dqcoeff, block, 16), sizeof(best_dqcoeff[0])); } } } } - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + if (x->skip_encode) + return best_rd; + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode.first = *best_mode; + xd->mode_info_context->bmi[block].as_mode = *best_mode; + src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, + p->src.buf, src_stride); dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - 
xd->plane[0].dst.buf, - xd->plane[0].dst.stride); + pd->dst.buf, dst_stride); - vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode, - dst, xd->plane[0].dst.stride); + vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4, + *best_mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride); // inverse transform if (best_tx_type != DCT_DCT) vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst, - xd->plane[0].dst.stride, best_tx_type); + dst_stride, best_tx_type); else xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst, - xd->plane[0].dst.stride); + dst_stride); } } @@ -777,15 +1323,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, - int *Distortion, int64_t best_rd) { + int64_t *Distortion, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; int cost = 0; - int distortion = 0; + int64_t distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; @@ -797,15 +1343,15 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, bmode_costs = mb->mbmode_cost; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int mis = xd->mode_info_stride; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int UNINITIALIZED_IS_SAFE(d); + int64_t UNINITIALIZED_IS_SAFE(d); i = idy * 2 + idx; - if (xd->frame_type == KEY_FRAME) { + if (cpi->common.frame_type == KEY_FRAME) { const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
left_block_mode(mic, i) : DC_PRED; @@ -820,51 +1366,45 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, distortion += d; tot_rate_y += ry; - mic->bmi[i].as_mode.first = best_mode; - for (j = 1; j < bh; ++j) - mic->bmi[i + j * 2].as_mode.first = best_mode; - for (j = 1; j < bw; ++j) - mic->bmi[i + j].as_mode.first = best_mode; + mic->bmi[i].as_mode = best_mode; + for (j = 1; j < num_4x4_blocks_high; ++j) + mic->bmi[i + j * 2].as_mode = best_mode; + for (j = 1; j < num_4x4_blocks_wide; ++j) + mic->bmi[i + j].as_mode = best_mode; if (total_rd >= best_rd) - break; + return INT64_MAX; } } - if (total_rd >= best_rd) - return INT64_MAX; - *Rate = cost; *rate_y = tot_rate_y; *Distortion = distortion; - xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first; + xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode; return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); } static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, - int64_t txfm_cache[NB_TXFM_MODES]) { + int64_t txfm_cache[NB_TXFM_MODES], + int64_t best_rd) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); MACROBLOCKD *const xd = &x->e_mbd; - int this_rate, this_rate_tokenonly; - int this_distortion, s; - int64_t best_rd = INT64_MAX, this_rd; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion, this_rd; TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); int i; int *bmode_costs = x->mbmode_cost; - if (bsize < BLOCK_SIZE_SB8X8) { - x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4; - return best_rd; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) + txfm_cache[i] = INT64_MAX; } - for (i = 0; i < NB_TXFM_MODES; i++) - txfm_cache[i] = INT64_MAX; - /* Y Search for 32x32 intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_txfm_cache[NB_TXFM_MODES]; @@ -880,8 +1420,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } x->e_mbd.mode_info_context->mbmi.mode = mode; - super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, - bsize, local_txfm_cache); + super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, + bsize, local_txfm_cache, best_rd); + + if (this_rate_tokenonly == INT_MAX) + continue; this_rate = this_rate_tokenonly + bmode_costs[mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); @@ -896,11 +1439,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = s; } - for (i = 0; i < NB_TXFM_MODES; i++) { - int64_t adj_rd = this_rd + local_txfm_cache[i] - - local_txfm_cache[cpi->common.txfm_mode]; - if (adj_rd < txfm_cache[i]) { - txfm_cache[i] = adj_rd; + if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) { + for (i = 0; i < NB_TXFM_MODES; i++) { + int64_t adj_rd = this_rd + local_txfm_cache[i] - + local_txfm_cache[cpi->common.tx_mode]; + if (adj_rd < txfm_cache[i]) { + txfm_cache[i] = adj_rd; + } } } } @@ -912,60 +1457,56 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, - int *skippable, BLOCK_SIZE_TYPE bsize, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + BLOCK_SIZE_TYPE bsize, TX_SIZE uv_tx_size) { MACROBLOCKD *const xd = &x->e_mbd; + int64_t dummy; if 
(xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) vp9_encode_intra_block_uv(cm, x, bsize); else vp9_xform_quant_sbuv(cm, x, bsize); - *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2); + *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2, + sse ? sse : &dummy); *rate = rdcost_uv(cm, x, bsize, uv_tx_size); *skippable = vp9_sbuv_is_skippable(xd, bsize); } static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize) { + int *rate, int64_t *distortion, int *skippable, + int64_t *sse, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); if (mbmi->ref_frame[0] > INTRA_FRAME) vp9_subtract_sbuv(x, bsize); - if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_32X32); - } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_16X16); - } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_8X8); - } else { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_4X4); - } + super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize, + uv_txfm_size); } static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); int64_t best_rd = INT64_MAX, this_rd; - int this_rate_tokenonly, this_rate; - int this_distortion, s; + int this_rate_tokenonly, this_rate, s; + int64_t this_distortion; - for (mode = DC_PRED; mode <= TM_PRED; mode++) { + MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ? + TM_PRED : cpi->sf.last_chroma_intra_mode; + + for (mode = DC_PRED; mode <= last_mode; mode++) { x->e_mbd.mode_info_context->mbmi.uv_mode = mode; super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, - &this_distortion, &s, bsize); + &this_distortion, &s, NULL, bsize); this_rate = this_rate_tokenonly + - x->intra_uv_mode_cost[x->e_mbd.frame_type][mode]; + x->intra_uv_mode_cost[cpi->common.frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); if (this_rd < best_rd) { @@ -983,21 +1524,60 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -int vp9_cost_mv_ref(VP9_COMP *cpi, - MB_PREDICTION_MODE m, - const int mode_context) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - int segment_id = xd->mode_info_context->mbmi.segment_id; - - // Dont account for mode here if segment skip is enabled. 
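The hunk around this point replaces vp9_cost_mv_ref() with a static cost_mv_ref() that looks the mode cost up in a precomputed per-context table (x->inter_mode_cost) instead of walking the mv-ref tree, while still charging zero bits when the segment-level SKIP feature is active. The following standalone sketch shows only that decision; the mode enum, context count and table values are made up for illustration and are not the encoder's real tables.

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the encoder's inter modes and contexts. */
    enum { NEARESTMV = 0, NEARMV, ZEROMV, NEWMV, INTER_MODES };
    #define MODE_CONTEXTS 6

    /* Per-context bit costs for each inter mode (values illustrative only). */
    static const int inter_mode_cost[MODE_CONTEXTS][INTER_MODES] = {
      { 128, 256, 64, 512 }, { 96, 300, 80, 480 }, { 150, 220, 70, 500 },
      { 128, 256, 64, 512 }, { 96, 300, 80, 480 }, { 150, 220, 70, 500 },
    };

    /* If the segment forces SKIP the mode is implied and costs no bits;
     * otherwise the cost is a table lookup by (context, mode). */
    static int cost_mv_ref(int seg_skip_active, int mode, int mode_context) {
      if (seg_skip_active)
        return 0;
      assert(mode >= NEARESTMV && mode <= NEWMV);
      return inter_mode_cost[mode_context][mode - NEARESTMV];
    }

    int main(void) {
      printf("NEWMV cost, ctx 2: %d\n", cost_mv_ref(0, NEWMV, 2));
      printf("NEWMV cost, skip segment: %d\n", cost_mv_ref(1, NEWMV, 2));
      return 0;
    }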
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - VP9_COMMON *pc = &cpi->common; - assert(NEARESTMV <= m && m <= NEWMV); - return cost_token(vp9_sb_mv_ref_tree, - pc->fc.inter_mode_probs[mode_context], - vp9_sb_mv_ref_encoding_array - NEARESTMV + m); - } else +static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE_TYPE bsize) { + int64_t this_rd; + + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + super_block_uvrd(&cpi->common, x, rate_tokenonly, + distortion, skippable, NULL, bsize); + *rate = *rate_tokenonly + + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; + this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); + + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + + return this_rd; +} + +static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + int *rate_uv, int *rate_uv_tokenonly, + int64_t *dist_uv, int *skip_uv, + MB_PREDICTION_MODE *mode_uv) { + MACROBLOCK *const x = &cpi->mb; + + // Use an estimated rd for uv_intra based on DC_PRED if the + // appropriate speed flag is set. + if (cpi->sf.use_uv_intra_rd_estimate) { + rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : + bsize); + // Else do a proper rd search for each possible transform size that may + // be considered in the main rd loop. + } else { + rd_pick_intra_sbuv_mode(cpi, x, + rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 + : bsize); + } + *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode; +} + +static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, + int mode_context) { + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int segment_id = xd->mode_info_context->mbmi.segment_id; + + // Don't account for mode here if segment skip is enabled. + if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + assert(is_inter_mode(mode)); + return x->inter_mode_cost[mode_context][mode - NEARESTMV]; + } else { return 0; + } } void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) { @@ -1029,8 +1609,8 @@ static int labels2mode(MACROBLOCK *x, int i, MB_MODE_INFO * mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; - int bw = 1 << b_width_log2(mbmi->sb_type); - int bh = 1 << b_height_log2(mbmi->sb_type); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; /* We have to be careful retrieving previously-encoded motion vectors. 
Ones from this macroblock have to be pulled from the BLOCKD array @@ -1072,77 +1652,63 @@ static int labels2mode(MACROBLOCK *x, int i, break; } - cost = vp9_cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + cost = cost_mv_ref(cpi, this_mode, + mbmi->mb_mode_context[mbmi->ref_frame[0]]); mic->bmi[i].as_mv[0].as_int = this_mv->as_int; if (mbmi->ref_frame[1] > 0) mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; x->partition_info->bmi[i].mode = m; - x->partition_info->bmi[i].mv.as_int = this_mv->as_int; - if (mbmi->ref_frame[1] > 0) - x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int; - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); - vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx], - &x->partition_info->bmi[i], - sizeof(x->partition_info->bmi[i])); - } - } cost += thismvcost; return cost; } -static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, +static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, + int64_t best_yrd, int i, int *labelyrate, - int *distortion, + int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl) { int k; + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; + const int width = plane_block_width(bsize, &xd->plane[0]); + const int height = plane_block_height(bsize, &xd->plane[0]); int idx, idy; const int src_stride = x->plane[0].src.stride; - uint8_t* const src = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - x->plane[0].src.buf, src_stride); - int16_t* src_diff = - raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i, - x->plane[0].src_diff); + uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + x->plane[0].src.buf, + src_stride); + int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i, + x->plane[0].src_diff); int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i); - uint8_t* const pre = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].pre[0].buf, - xd->plane[0].pre[0].stride); - uint8_t* const dst = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride); - int thisdistortion = 0; + uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride); + int64_t thisdistortion = 0, thissse = 0; int thisrate = 0; - *labelyrate = 0; - *distortion = 0; - vp9_build_inter_predictor(pre, xd->plane[0].pre[0].stride, dst, xd->plane[0].dst.stride, &xd->mode_info_context->bmi[i].as_mv[0], &xd->scale_factor[0], - 4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix); + width, height, 0, &xd->subpix, + MV_PRECISION_Q3); - // TODO(debargha): Make this work properly with the - // implicit-compoundinter-weight experiment when implicit - // weighting for splitmv modes is turned on. 
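The code continuing below builds a second inter predictor when the block has two reference frames and passes the avg flag to vp9_build_inter_predictor, so the compound prediction is the rounded per-pixel average of the two single-reference predictions. A small self-contained sketch of that averaging step, independent of the library's predictor builders:

    #include <stdint.h>
    #include <stdio.h>

    /* Rounded average of two 8-bit predictions, as used for compound
     * (two-reference) inter prediction. Strides allow sub-views of larger
     * buffers. */
    static void avg_pred(uint8_t *dst, int dst_stride,
                         const uint8_t *p0, int p0_stride,
                         const uint8_t *p1, int p1_stride,
                         int width, int height) {
      int r, c;
      for (r = 0; r < height; ++r)
        for (c = 0; c < width; ++c)
          dst[r * dst_stride + c] =
              (uint8_t)((p0[r * p0_stride + c] + p1[r * p1_stride + c] + 1) >> 1);
    }

    int main(void) {
      uint8_t a[4] = { 10, 20, 30, 40 }, b[4] = { 11, 21, 29, 40 }, out[4];
      avg_pred(out, 4, a, 4, b, 4, 4, 1);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 11 21 30 40 */
      return 0;
    }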
if (xd->mode_info_context->mbmi.ref_frame[1] > 0) { uint8_t* const second_pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, @@ -1151,17 +1717,20 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride, dst, xd->plane[0].dst.stride, &xd->mode_info_context->bmi[i].as_mv[1], - &xd->scale_factor[1], 4 * bw, 4 * bh, 1, - &xd->subpix); + &xd->scale_factor[1], + width, height, 1, + &xd->subpix, MV_PRECISION_Q3); } - vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8, + vp9_subtract_block(height, width, src_diff, 8, src, src_stride, dst, xd->plane[0].dst.stride); k = i; - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < height / 4; ++idy) { + for (idx = 0; idx < width / 4; ++idx) { + int64_t ssz, rd, rd1, rd2; + k += (idy * 2 + idx); src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k, x->plane[0].src_diff); @@ -1170,30 +1739,50 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, x->quantize_b_4x4(x, k, DCT_DCT, 16); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, - k, 16), 16); + k, 16), 16, &ssz); + thissse += ssz; thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC, ta + (k & 1), - tl + (k >> 1), TX_4X4, 16); + tl + (k >> 1), TX_4X4, + vp9_default_scan_4x4, + vp9_default_scan_4x4_neighbors); + rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2); + rd = MIN(rd1, rd2); + if (rd >= best_yrd) + return INT64_MAX; } } - *distortion += thisdistortion; - *labelyrate += thisrate; + *distortion = thisdistortion >> 2; + *labelyrate = thisrate; + *sse = thissse >> 2; - *distortion >>= 2; return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } typedef struct { + int eobs; + int brate; + int byrate; + int64_t bdist; + int64_t bsse; + int64_t brdcost; + int_mv mvs[2]; + ENTROPY_CONTEXT ta[2]; + ENTROPY_CONTEXT tl[2]; +} SEG_RDSTAT; + +typedef struct { int_mv *ref_mv, *second_ref_mv; int_mv mvp; int64_t segment_rd; int r; - int d; + int64_t d; + int64_t sse; int segment_yrate; MB_PREDICTION_MODE modes[4]; - int_mv mvs[4], second_mvs[4]; - int eobs[4]; + SEG_RDSTAT rdstat[4][VP9_INTER_MODES]; int mvthresh; } BEST_SEG_INFO; @@ -1206,50 +1795,6 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { return r; } -static enum BlockSize get_block_size(int bw, int bh) { - if (bw == 4 && bh == 4) - return BLOCK_4X4; - - if (bw == 4 && bh == 8) - return BLOCK_4X8; - - if (bw == 8 && bh == 4) - return BLOCK_8X4; - - if (bw == 8 && bh == 8) - return BLOCK_8X8; - - if (bw == 8 && bh == 16) - return BLOCK_8X16; - - if (bw == 16 && bh == 8) - return BLOCK_16X8; - - if (bw == 16 && bh == 16) - return BLOCK_16X16; - - if (bw == 32 && bh == 32) - return BLOCK_32X32; - - if (bw == 32 && bh == 16) - return BLOCK_32X16; - - if (bw == 16 && bh == 32) - return BLOCK_16X32; - - if (bw == 64 && bh == 32) - return BLOCK_64X32; - - if (bw == 32 && bh == 64) - return BLOCK_32X64; - - if (bw == 64 && bh == 64) - return BLOCK_64X64; - - assert(0); - return -1; -} - static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; x->plane[0].src.buf = @@ -1278,32 +1823,31 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, } static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, - BEST_SEG_INFO *bsi, + BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { 
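rd_check_segment_txsize now receives the whole bsi_buf plus a filter_idx rather than a single BEST_SEG_INFO, so the per-block, per-mode statistics gathered for the first interpolation filter can be reused for later filters when the chosen motion vector is full-pel (and therefore filter-independent) and identical, as the subpelmv/have_ref test further down does. A rough sketch of that reuse pattern with simplified types; the struct, field names and sub-pel mask here are illustrative, not the encoder's definitions.

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_FILTERS 3
    #define INTER_MODES 4

    typedef struct {
      int32_t mv_row, mv_col;   /* motion vector in 1/8-pel units */
      int64_t rd_cost;
      int     valid;
    } ModeStat;

    /* One table of per-mode stats per interpolation filter. */
    static ModeStat stats[NUM_FILTERS][INTER_MODES];

    /* Reuse a cached result only if an earlier filter already evaluated this
     * mode with the same full-pel MV; sub-pel MVs depend on the filter taps,
     * so they are never reused. */
    static int reuse_from_earlier_filter(int filter, int mode,
                                         int32_t mv_row, int32_t mv_col,
                                         ModeStat *out) {
      int f;
      const int subpel = (mv_row & 7) || (mv_col & 7);
      if (filter == 0 || subpel)
        return 0;
      for (f = 0; f < filter; ++f) {
        const ModeStat *s = &stats[f][mode];
        if (s->valid && s->mv_row == mv_row && s->mv_col == mv_col) {
          *out = *s;
          return 1;
        }
      }
      return 0;
    }

    int main(void) {
      ModeStat hit;
      stats[0][1] = (ModeStat){ 16, -8, 12345, 1 };  /* full-pel result, filter 0 */
      if (reuse_from_earlier_filter(1, 1, 16, -8, &hit))
        printf("reused rd=%lld\n", (long long)hit.rd_cost);
      return 0;
    }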
- int i, j; - int br = 0, bd = 0; + int i, j, br = 0, idx, idy; + int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; + MODE_INFO *mi = x->e_mbd.mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; const int label_count = 4; - int64_t this_segment_rd = 0, other_segment_rd; + int64_t this_segment_rd = 0; int label_mv_thresh; - int rate = 0; - int sbr = 0, sbd = 0; int segmentyrate = 0; - int best_eobs[4] = { 0 }; BLOCK_SIZE_TYPE bsize = mbmi->sb_type; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int idx, idy; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; vp9_variance_fn_ptr_t *v_fn_ptr; - ENTROPY_CONTEXT t_above[4], t_left[4]; - ENTROPY_CONTEXT t_above_b[4], t_left_b[4]; + ENTROPY_CONTEXT t_above[2], t_left[2]; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + int mode_idx; + int subpelmv = 1, have_ref = 0; vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); - v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)]; + v_fn_ptr = &cpi->fn_ptr[bsize]; // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on @@ -1312,17 +1856,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, label_mv_thresh = 1 * bsi->mvthresh / label_count; // Segmentation method overheads - other_segment_rd = this_segment_rd; - - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { // TODO(jingning,rbultje): rewrite the rate-distortion optimization // loop for 4x4/4x8/8x4 block coding. 
to be replaced with new rd loop int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT]; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX; MB_PREDICTION_MODE mode_selected = ZEROMV; - int bestlabelyrate = 0; + int64_t best_rd = INT64_MAX; i = idy * 2 + idx; frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; @@ -1339,20 +1880,58 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - int64_t this_rd; - int distortion; - int labelyrate; - ENTROPY_CONTEXT t_above_s[4], t_left_s[4]; const struct buf_2d orig_src = x->plane[0].src; struct buf_2d orig_pre[2]; - vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + mode_idx = inter_mode_offset(this_mode); + bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + + // if we're near/nearest and mv == 0,0, compare to zeromv + if ((this_mode == NEARMV || this_mode == NEARESTMV || + this_mode == ZEROMV) && + frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 && + (mbmi->ref_frame[1] <= 0 || + frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) { + int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + + if (this_mode == NEARMV) { + if (c1 > c3) + continue; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) + continue; + } else { + assert(this_mode == ZEROMV); + if (mbmi->ref_frame[1] <= 0) { + if ((c3 >= c2 && + frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || + (c3 >= c1 && + frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) + continue; + } else { + if ((c3 >= c2 && + frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && + frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || + (c3 >= c1 && + frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && + frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) + continue; + } + } + } - vpx_memcpy(t_above_s, t_above, sizeof(t_above_s)); - vpx_memcpy(t_left_s, t_left, sizeof(t_left_s)); + vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above, + sizeof(bsi->rdstat[i][mode_idx].ta)); + vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left, + sizeof(bsi->rdstat[i][mode_idx].tl)); // motion search for newmv (single predictor case only) - if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV) { + if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV && + seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { int step_param = 0; int further_steps; int thissme, bestsme = INT_MAX; @@ -1361,7 +1940,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, /* Is the best so far sufficiently good that we cant justify doing * and new motion search. */ - if (best_label_rd < label_mv_thresh) + if (best_rd < label_mv_thresh) break; if (cpi->compressor_speed) { @@ -1372,9 +1951,24 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (i == 2) bsi->mvp.as_int = x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int; - step_param = 2; } } + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and the best ref mvs of the current block for + // the given reference. 
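The auto_mv_step_size branch continuing just below picks the diamond search's starting step as the average of two estimates: one derived from the previous frame's maximum MV magnitude (or the block's best reference MVs) and the encoder's baseline mv_step_param. The sketch below mirrors that blend; init_search_range() here is a hypothetical mapping (larger expected motion gives a smaller step_param, i.e. a longer first step), not the library's vp9_init_search_range().

    #include <stdio.h>

    #define MAX_MVSEARCH_STEPS 11

    /* Hypothetical mapping from an expected MV magnitude (full pels) to a
     * starting step index: small expected motion => large step_param =>
     * short first steps and a restricted search. */
    static int init_search_range(int expected_mv_magnitude) {
      int sr = 0;
      while ((1 << sr) < expected_mv_magnitude && sr < MAX_MVSEARCH_STEPS - 2)
        ++sr;
      return (MAX_MVSEARCH_STEPS - 2) - sr;
    }

    /* Blend the motion-based estimate with the encoder baseline, as the patch
     * does with cpi->mv_step_param. */
    static int pick_step_param(int max_mv_last_frame, int baseline_step) {
      return (init_search_range(max_mv_last_frame) + baseline_step) >> 1;
    }

    int main(void) {
      printf("quiet frame: step_param=%d\n", pick_step_param(2, 6));
      printf("busy frame:  step_param=%d\n", pick_step_param(128, 6));
      return 0;
    }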
+ if (i == 0) + step_param = (vp9_init_search_range( + cpi, x->max_mv_context[mbmi->ref_frame[0]]) + + cpi->mv_step_param) >> 1; + else + step_param = (vp9_init_search_range( + cpi, MAX(abs(bsi->mvp.as_mv.row), + abs(bsi->mvp.as_mv.col)) >> 3) + + cpi->mv_step_param) >> 1; + } else { + step_param = cpi->mv_step_param; + } further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -1424,14 +2018,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // restore src pointers mi_buf_restore(x, orig_src, orig_pre); - } else if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV) { + } + + if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV && + mbmi->interp_filter == vp9_switchable_interp[0]) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; // adjust src pointers mi_buf_shift(x, i); - if (cpi->sf.comp_inter_joint_search_thresh < bsize) { + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, mi_col, seg_mvs[i], @@ -1445,146 +2042,209 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mi_buf_restore(x, orig_src, orig_pre); } - rate = labels2mode(x, i, this_mode, &mode_mv[this_mode], - &second_mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, - x->mvcost, cpi); + bsi->rdstat[i][mode_idx].brate = + labels2mode(x, i, this_mode, &mode_mv[this_mode], + &second_mode_mv[this_mode], frame_mv, seg_mvs[i], + bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, + x->mvcost, cpi); + + bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].mvs[0].as_int = + mode_mv[this_mode].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].mvs[0].as_int = + mode_mv[this_mode].as_int; + if (mbmi->ref_frame[1] > 0) { + bsi->rdstat[i][mode_idx].mvs[1].as_int = + second_mode_mv[this_mode].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].mvs[1].as_int = + second_mode_mv[this_mode].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].mvs[1].as_int = + second_mode_mv[this_mode].as_int; + } // Trap vectors that reach beyond the UMV borders - if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || - ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || - ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || - ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) { + if (mv_check_bounds(x, &mode_mv[this_mode])) continue; - } if (mbmi->ref_frame[1] > 0 && mv_check_bounds(x, &second_mode_mv[this_mode])) continue; - this_rd = encode_inter_mb_segment(&cpi->common, - x, i, &labelyrate, - &distortion, t_above_s, t_left_s); - this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0); - rate += labelyrate; + if (filter_idx > 0) { + BEST_SEG_INFO *ref_bsi = bsi_buf; + subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) || + (mode_mv[this_mode].as_mv.col & 0x0f); + have_ref = mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; + if (mbmi->ref_frame[1] > 0) { + subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) || + (second_mode_mv[this_mode].as_mv.col & 0x0f); + have_ref &= second_mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; + } + + if (filter_idx > 1 && !subpelmv && !have_ref) { + ref_bsi = bsi_buf + 1; + have_ref = mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; + if (mbmi->ref_frame[1] > 0) { + have_ref &= 
second_mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; + } + } + + if (!subpelmv && have_ref && + ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { + vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], + sizeof(SEG_RDSTAT)); + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + mode_selected = this_mode; + best_rd = bsi->rdstat[i][mode_idx].brdcost; + } + continue; + } + } + + bsi->rdstat[i][mode_idx].brdcost = + encode_inter_mb_segment(cpi, x, + bsi->segment_rd - this_segment_rd, i, + &bsi->rdstat[i][mode_idx].byrate, + &bsi->rdstat[i][mode_idx].bdist, + &bsi->rdstat[i][mode_idx].bsse, + bsi->rdstat[i][mode_idx].ta, + bsi->rdstat[i][mode_idx].tl); + if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv, + bsi->rdstat[i][mode_idx].brate, 0); + bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; + bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i]; + } - if (this_rd < best_label_rd) { - sbr = rate; - sbd = distortion; - bestlabelyrate = labelyrate; + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_label_rd = this_rd; - best_eobs[i] = x->e_mbd.plane[0].eobs[i]; - vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s)); - vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s)); + best_rd = bsi->rdstat[i][mode_idx].brdcost; } } /*for each 4x4 mode*/ - vpx_memcpy(t_above, t_above_b, sizeof(t_above)); - vpx_memcpy(t_left, t_left_b, sizeof(t_left)); + if (best_rd == INT64_MAX) { + int iy, midx; + for (iy = i + 1; iy < 4; ++iy) + for (midx = 0; midx < VP9_INTER_MODES; ++midx) + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; + return; + } + + mode_idx = inter_mode_offset(mode_selected); + vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); + vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); labels2mode(x, i, mode_selected, &mode_mv[mode_selected], &second_mode_mv[mode_selected], frame_mv, seg_mvs[i], bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, x->mvcost, cpi); - br += sbr; - bd += sbd; - segmentyrate += bestlabelyrate; - this_segment_rd += best_label_rd; - other_segment_rd += best_other_rd; + br += bsi->rdstat[i][mode_idx].brate; + bd += bsi->rdstat[i][mode_idx].bdist; + block_sse += bsi->rdstat[i][mode_idx].bsse; + segmentyrate += bsi->rdstat[i][mode_idx].byrate; + this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + + if (this_segment_rd > bsi->segment_rd) { + int iy, midx; + for (iy = i + 1; iy < 4; ++iy) + for (midx = 0; midx < VP9_INTER_MODES; ++midx) + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; + return; + } - for (j = 1; j < bh; ++j) + for (j = 1; j < num_4x4_blocks_high; ++j) vpx_memcpy(&x->partition_info->bmi[i + j * 2], &x->partition_info->bmi[i], sizeof(x->partition_info->bmi[i])); - for (j = 1; j < bw; ++j) + for (j = 1; j < num_4x4_blocks_wide; ++j) vpx_memcpy(&x->partition_info->bmi[i + j], &x->partition_info->bmi[i], sizeof(x->partition_info->bmi[i])); } } /* for each label */ - if (this_segment_rd < bsi->segment_rd) { - bsi->r = br; - bsi->d = bd; - bsi->segment_yrate = segmentyrate; - bsi->segment_rd = this_segment_rd; + bsi->r = br; + bsi->d = bd; + bsi->segment_yrate = segmentyrate; + bsi->segment_rd = this_segment_rd; + bsi->sse = block_sse; - // store everything needed to come back to this!! 
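The SEG_RDSTAT bookkeeping above replaces the loose best_eobs/t_above_b/t_left_b locals: every (4x4 block, inter mode) candidate records its rate, distortion, SSE, RD cost, motion vectors and the entropy contexts it produced, and the winning mode's contexts are copied back into t_above/t_left before the next block is coded. A compact sketch of that record-then-restore pattern, with one-byte contexts and hypothetical field names standing in for the real structs:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define INTER_MODES 4

    typedef struct {
      int64_t rd_cost;
      int     rate, eobs;
      uint8_t ctx_above[2], ctx_left[2];  /* contexts left behind by this mode */
    } RdStat;

    int main(void) {
      uint8_t t_above[2] = { 0, 0 }, t_left[2] = { 0, 0 };
      RdStat stat[INTER_MODES];
      int mode, best_mode = 0;
      int64_t best_rd = INT64_MAX;

      for (mode = 0; mode < INTER_MODES; ++mode) {
        /* Each trial starts from the current contexts... */
        memcpy(stat[mode].ctx_above, t_above, sizeof(t_above));
        memcpy(stat[mode].ctx_left, t_left, sizeof(t_left));
        /* ...codes the block (stubbed out here) and records its outcome. */
        stat[mode].rate = 100 + 10 * mode;
        stat[mode].eobs = mode;
        stat[mode].ctx_above[0] = (uint8_t)(mode != 0);
        stat[mode].rd_cost = 1000 - 50 * mode;
        if (stat[mode].rd_cost < best_rd) {
          best_rd = stat[mode].rd_cost;
          best_mode = mode;
        }
      }

      /* Restore the contexts produced by the winner before the next block. */
      memcpy(t_above, stat[best_mode].ctx_above, sizeof(t_above));
      memcpy(t_left, stat[best_mode].ctx_left, sizeof(t_left));
      printf("best mode %d, rd %lld, ctx_above[0]=%d\n",
             best_mode, (long long)best_rd, t_above[0]);
      return 0;
    }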
- for (i = 0; i < 4; i++) { - bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv; - if (mbmi->ref_frame[1] > 0) - bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv; - bsi->modes[i] = x->partition_info->bmi[i].mode; - bsi->eobs[i] = best_eobs[i]; - } - } + // update the coding decisions + for (i = 0; i < 4; ++i) + bsi->modes[i] = x->partition_info->bmi[i].mode; } -static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, - int_mv *second_best_ref_mv, - int64_t best_rd, - int *returntotrate, - int *returnyrate, - int *returndistortion, - int *skippable, int mvthresh, - int_mv seg_mvs[4][MAX_REF_FRAMES], - int mi_row, int mi_col) { +static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, + int_mv *best_ref_mv, + int_mv *second_best_ref_mv, + int64_t best_rd, + int *returntotrate, + int *returnyrate, + int64_t *returndistortion, + int *skippable, int64_t *psse, + int mvthresh, + int_mv seg_mvs[4][MAX_REF_FRAMES], + BEST_SEG_INFO *bsi_buf, + int filter_idx, + int mi_row, int mi_col) { int i; - BEST_SEG_INFO bsi; - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mode_info_context; + MB_MODE_INFO *mbmi = &mi->mbmi; + int mode_idx; - vpx_memset(&bsi, 0, sizeof(bsi)); + vpx_memset(bsi, 0, sizeof(*bsi)); - bsi.segment_rd = best_rd; - bsi.ref_mv = best_ref_mv; - bsi.second_ref_mv = second_best_ref_mv; - bsi.mvp.as_int = best_ref_mv->as_int; - bsi.mvthresh = mvthresh; + bsi->segment_rd = best_rd; + bsi->ref_mv = best_ref_mv; + bsi->second_ref_mv = second_best_ref_mv; + bsi->mvp.as_int = best_ref_mv->as_int; + bsi->mvthresh = mvthresh; for (i = 0; i < 4; i++) - bsi.modes[i] = ZEROMV; + bsi->modes[i] = ZEROMV; - rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col); + rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col); + if (bsi->segment_rd > best_rd) + return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { - x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int; + mode_idx = inter_mode_offset(bsi->modes[i]); + mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; if (mbmi->ref_frame[1] > 0) - x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int = - bsi.second_mvs[i].as_int; - x->e_mbd.plane[0].eobs[i] = bsi.eobs[i]; + mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; + xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs; + x->partition_info->bmi[i].mode = bsi->modes[i]; } - /* save partitions */ - x->partition_info->count = 4; - - for (i = 0; i < x->partition_info->count; i++) { - x->partition_info->bmi[i].mode = bsi.modes[i]; - x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv; - if (mbmi->ref_frame[1] > 0) - x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv; - } /* * used to set mbmi->mv.as_int */ - x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int; - if (mbmi->ref_frame[1] > 0) - x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int; - - *returntotrate = bsi.r; - *returndistortion = bsi.d; - *returnyrate = bsi.segment_yrate; + *returntotrate = bsi->r; + *returndistortion = bsi->d; + *returnyrate = bsi->segment_yrate; *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8); - mbmi->mode = bsi.modes[3]; + *psse = bsi->sse; + mbmi->mode = bsi->modes[3]; - return (int)(bsi.segment_rd); + return bsi->segment_rd; } static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t 
*ref_y_buffer, int ref_y_stride, - int ref_frame, enum BlockSize block_size ) { + int ref_frame, BLOCK_SIZE_TYPE block_size ) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int_mv this_mv; @@ -1593,6 +2253,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, int best_index = 0; int best_sad = INT_MAX; int this_sad = INT_MAX; + unsigned int max_mv = 0; uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; @@ -1602,6 +2263,8 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) { this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int; + max_mv = MAX(max_mv, + MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); // The list is at an end if we see 0 for a second time. if (!this_mv.as_int && zero_seen) break; @@ -1625,6 +2288,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, // Note the index of the mv that worked best in the reference list. x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; } static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, @@ -1633,18 +2297,18 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, vp9_prob *comp_mode_p) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - int seg_ref_active = vp9_segfeature_active(xd, segment_id, + int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp)); *comp_mode_p = 128; } else { - vp9_prob intra_inter_p = vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER); + vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd); vp9_prob comp_inter_p = 128; if (cm->comp_pred_mode == HYBRID_PREDICTION) { - comp_inter_p = vp9_get_pred_prob(cm, xd, PRED_COMP_INTER_INTER); + comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd); *comp_mode_p = comp_inter_p; } else { *comp_mode_p = 128; @@ -1653,8 +2317,8 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0); if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { - vp9_prob ref_single_p1 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P1); - vp9_prob ref_single_p2 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P2); + vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd); + vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd); unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); if (cm->comp_pred_mode == HYBRID_PREDICTION) @@ -1673,7 +2337,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, ref_costs_single[ALTREF_FRAME] = 512; } if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) { - vp9_prob ref_comp_p = vp9_get_pred_prob(cm, xd, PRED_COMP_REF_P); + vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd); unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); if (cm->comp_pred_mode == HYBRID_PREDICTION) @@ -1689,12 +2353,13 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, } static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, - int mode_index, - PARTITION_INFO *partition, - int_mv *ref_mv, - int_mv *second_ref_mv, - int64_t comp_pred_diff[NB_PREDICTION_TYPES], - int64_t txfm_size_diff[NB_TXFM_MODES]) { + int mode_index, + PARTITION_INFO *partition, + int_mv *ref_mv, + int_mv *second_ref_mv, + int64_t comp_pred_diff[NB_PREDICTION_TYPES], + int64_t 
txfm_size_diff[NB_TXFM_MODES], + int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be @@ -1713,7 +2378,11 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; + // FIXME(rbultje) does this memcpy the whole array? I believe sizeof() + // doesn't actually work this way memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff)); + memcpy(ctx->best_filter_diff, best_filter_diff, + sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1)); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -1744,7 +2413,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int idx, MV_REFERENCE_FRAME frame_type, - enum BlockSize block_size, + BLOCK_SIZE_TYPE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], @@ -1786,8 +2455,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. - if (scale[frame_type].x_scale_fp == (1 << VP9_REF_SCALE_SHIFT) && - scale[frame_type].y_scale_fp == (1 << VP9_REF_SCALE_SHIFT)) + if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE && + scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -1800,93 +2469,11 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { return scaled_ref_frame; } -static void model_rd_from_var_lapndz(int var, int n, int qstep, - int *rate, int *dist) { - // This function models the rate and distortion for a Laplacian - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expressions are in: - // Hang and Chen, "Source Model for transform video coder and its - // application - Part I: Fundamental Theory", IEEE Trans. Circ. - // Sys. for Video Tech., April 1997. - // The function is implemented as piecewise approximation to the - // exact computation. 
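The model removed below (model_rd_from_var_lapndz and its per-plane wrapper model_rd_for_sb) estimates rate and distortion from the variance of the prediction error and the effective quantizer step (dequant[1] >> 3, since the transform coefficients are at 8x scale); later hunks still call model_rd_for_sb, so the definition is presumably relocated rather than dropped. Those modeled numbers only rank candidates cheaply, for example the new use_rd_breakout path gives up when the modeled RD cost is already more than twice the best so far. The sketch below shows that consumption pattern only: the model and variance values are stubs, and the rd_cost() weighting is an assumed RDCOST-like form (rate scaled by rdmult/256, distortion shifted by rddiv), not a quote of the real macro.

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed RDCOST-style weighting; the encoder's actual macro may differ. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

    /* Stub for the Laplacian-based model: a rate/distortion guess for one
     * plane from its prediction-error variance and quantizer step.
     * Illustrative only: a finer quantizer costs more rate, less distortion. */
    static void model_plane_rd(unsigned int var, int qstep,
                               int *rate, int64_t *dist) {
      *rate = (int)(var / (unsigned)(qstep + 1));
      *dist = (int64_t)var - *rate;
      if (*dist < 0) *dist = 0;
    }

    int main(void) {
      /* Per-plane prediction-error variances and dequant values (made up). */
      const unsigned int var[3] = { 4000, 900, 850 };
      const int dequant1[3] = { 160, 144, 144 };
      const int rdmult = 300, rddiv = 2;
      const int64_t ref_best_rd = 2000;

      int i, rate_sum = 0;
      int64_t dist_sum = 0, rd;
      for (i = 0; i < 3; ++i) {
        int rate;
        int64_t dist;
        model_plane_rd(var[i], dequant1[i] >> 3, &rate, &dist);
        rate_sum += rate;
        dist_sum += dist;
      }
      rd = rd_cost(rdmult, rddiv, rate_sum, dist_sum);

      /* rd-breakout: if the modeled cost is more than twice the best so far,
       * skip the full rate-distortion evaluation for this candidate. */
      if (rd / 2 > ref_best_rd)
        printf("modeled rd=%lld -> skip full RD\n", (long long)rd);
      else
        printf("modeled rd=%lld -> do full RD\n", (long long)rd);
      return 0;
    }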
- // TODO(debargha): Implement the functions by interpolating from a - // look-up table - vp9_clear_system_state(); - if (var == 0 || n == 0) { - *rate = 0; - *dist = 0; - } else { - double D, R; - double s2 = (double) var / n; - double s = sqrt(s2); - double x = qstep / s; - if (x > 1.0) { - double y = exp(-x / 2); - double y2 = y * y; - D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275; - R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017; - } else { - double x2 = x * x; - D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807; - if (x > 0.125) - R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x + - 0.1626989668625); - else - R = -1.442252874826093 * log(x) + 1.944647760719664; - } - if (R < 0) { - *rate = 0; - *dist = var; - } else { - *rate = (n * R * 256 + 0.5); - *dist = (n * D * s2 + 0.5); - } - } - vp9_clear_system_state(); -} - -static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize, - struct macroblockd_plane *pd) { - return get_block_size(plane_block_width(bsize, pd), - plane_block_height(bsize, pd)); -} - -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int *out_dist_sum) { - // Note our transform coeffs are 8 times an orthogonal transform. - // Hence quantizer step is also 8 times. To get effective quantizer - // we need to divide by 8 before sending to modeling function. - unsigned int sse, var; - int i, rate_sum = 0, dist_sum = 0; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bw = plane_block_width(bsize, pd); - const int bh = plane_block_height(bsize, pd); - const enum BlockSize bs = get_block_size(bw, bh); - int rate, dist; - var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist); - - rate_sum += rate; - dist_sum += dist; - } - - *out_rate_sum = rate_sum; - *out_dist_sum = dist_sum; -} - static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int c = vp9_get_pred_context_switchable_interp(xd); const int m = vp9_switchable_interp_map[mbmi->interp_filter]; return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; } @@ -1896,16 +2483,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; + VP9_COMMON *cm = &cpi->common; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; int bestsme = INT_MAX; - int further_steps, step_param = cpi->sf.first_step; + int further_steps, step_param; int sadpb = x->sadperbit16; int_mv mvp_full; int ref = mbmi->ref_frame[0]; int_mv ref_mv = mbmi->ref_mvs[ref][0]; - int sr = 0; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -1922,24 +2509,48 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, 
scaled_ref_frame, NULL, mi_row, mi_col, - NULL, NULL); + setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } vp9_clamp_mv_min_max(x, &ref_mv); - sr = vp9_init_search_range(cpi->common.width, cpi->common.height); - - // mvp_full.as_int = ref_mv[0].as_int; - mvp_full.as_int = - mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; + // Adjust search parameters based on small partitions' result. + if (x->fast_ms) { + // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 && + // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) { + // adjust search range + step_param = 6; + if (x->fast_ms > 1) + step_param = 8; + + // Get prediction MV. + mvp_full.as_int = x->pred_mv.as_int; + + // Adjust MV sign if needed. + if (cm->ref_frame_sign_bias[ref]) { + mvp_full.as_mv.col *= -1; + mvp_full.as_mv.row *= -1; + } + } else { + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + + cpi->mv_step_param) >> 1; + } else { + step_param = cpi->mv_step_param; + } + // mvp_full.as_int = ref_mv[0].as_int; + mvp_full.as_int = + mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; + } mvp_full.as_mv.col >>= 3; mvp_full.as_mv.row >>= 3; - // adjust search range according to sr from mv prediction - step_param = MAX(step_param, sr); - // Further step/diamond searches as necessary further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; @@ -1984,7 +2595,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv ref_mv[2]; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); int ite; // Prediction buffer from second frame. uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); @@ -2008,8 +2619,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // motion search code to be used without additional modifications. for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, - NULL, NULL); + setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL); } if (scaled_ref_frame[1]) { @@ -2017,8 +2627,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_second_yv12[i] = xd->plane[i].pre[1]; - setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col, - NULL, NULL); + setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL); } xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], @@ -2057,7 +2666,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, &frame_mv[refs[!id]], &xd->scale_factor[!id], pw, ph, 0, - &xd->subpix); + &xd->subpix, MV_PRECISION_Q3); // Compound motion search on first ref frame. 
if (id) @@ -2134,35 +2743,37 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int64_t txfm_cache[], - int *rate2, int *distortion, int *skippable, - int *rate_y, int *distortion_y, - int *rate_uv, int *distortion_uv, + int *rate2, int64_t *distortion, + int *skippable, + int *rate_y, int64_t *distortion_y, + int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, INTERPOLATIONFILTERTYPE *best_filter, - int_mv *frame_mv, + int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, - int_mv single_newmv[MAX_REF_FRAMES]) { - const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); - + int_mv single_newmv[MAX_REF_FRAMES], + int64_t *psse, int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); - const enum BlockSize uv_block_size = get_plane_block_size(bsize, - &xd->plane[1]); MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; const int is_comp_pred = (mbmi->ref_frame[1] > 0); const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; + int_mv *frame_mv = mode_mv[this_mode]; int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv cur_mv[2]; int64_t this_rd = 0; - unsigned char tmp_buf[MAX_MB_PLANE][64 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); int pred_exists = 0; int interpolating_intpel_seen = 0; int intpel_mv; int64_t rd, best_rd = INT64_MAX; + int best_needs_copy = 0; + uint8_t *orig_dst[MAX_MB_PLANE]; + int orig_dst_stride[MAX_MB_PLANE]; + int rs = 0; switch (this_mode) { int rate_mv; @@ -2172,7 +2783,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh < bsize) { + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); } else { @@ -2189,7 +2800,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; *rate2 += rate_mv; - } else { int_mv tmp_mv; single_motion_search(cpi, x, bsize, mi_row, mi_col, @@ -2206,6 +2816,43 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, default: break; } + + // if we're near/nearest and mv == 0,0, compare to zeromv + if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && + frame_mv[refs[0]].as_int == 0 && + !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) && + (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { + int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + + if (this_mode == NEARMV) { + if (c1 > c3) + return INT64_MAX; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) + return INT64_MAX; + } else { + assert(this_mode == ZEROMV); + if (num_refs == 1) { + if ((c3 >= c2 && + mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || + (c3 >= c1 && + mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) + return INT64_MAX; + } else { + if ((c3 >= c2 && + mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && + mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || + (c3 >= c1 && + 
mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && + mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) + return INT64_MAX; + } + } + } + for (i = 0; i < num_refs; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image @@ -2219,12 +2866,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[i].as_int = cur_mv[i].as_int; } + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < MAX_MB_PLANE; i++) { + orig_dst[i] = xd->plane[i].dst.buf; + orig_dst_stride[i] = xd->plane[i].dst.stride; + } + /* We don't include the cost of the second reference here, because there * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other * words if you present them in that order, the second one is always known * if the first is known */ - *rate2 += vp9_cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + *rate2 += cost_mv_ref(cpi, this_mode, + mbmi->mb_mode_context[mbmi->ref_frame[0]]); + + if (!(*mode_excluded)) { + if (is_comp_pred) { + *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); + } else { + *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); + } + } pred_exists = 0; interpolating_intpel_seen = 0; @@ -2236,78 +2901,113 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (mbmi->mv[1].as_mv.col & 15) == 0; // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - if (cpi->speed > 4) { + *best_filter = EIGHTTAP; + if (cpi->sf.use_8tap_always) { *best_filter = EIGHTTAP; + vp9_zero(cpi->rd_filter_cache); } else { int i, newbest; - int tmp_rate_sum = 0, tmp_dist_sum = 0; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; + + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { - int rs = 0; + int j; + int64_t rs_rd; const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i]; - const int is_intpel_interp = intpel_mv && - vp9_is_interpolating_filter[filter]; + const int is_intpel_interp = intpel_mv; mbmi->interp_filter = filter; vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - - if (cm->mcomp_filter_type == SWITCHABLE) - rs = get_switchable_rate(cm, x); + rs = get_switchable_rate(cm, x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); if (interpolating_intpel_seen && is_intpel_interp) { - rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum); + cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, + tmp_rate_sum, tmp_dist_sum); + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; } else { - int rate_sum = 0, dist_sum = 0; + int rate_sum = 0; + int64_t dist_sum = 0; + if ((cm->mcomp_filter_type == SWITCHABLE && + (!i || best_needs_copy)) || + (cm->mcomp_filter_type != SWITCHABLE && + (cm->mcomp_filter_type == mbmi->interp_filter || + (!interpolating_intpel_seen && is_intpel_interp)))) { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = orig_dst[j]; + xd->plane[j].dst.stride = orig_dst_stride[j]; + } + } else { + for (j = 0; j < MAX_MB_PLANE; j++) { + 
xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; + xd->plane[j].dst.stride = 64; + } + } vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); - rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum); + cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, + rate_sum, dist_sum); + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; if (!interpolating_intpel_seen && is_intpel_interp) { tmp_rate_sum = rate_sum; tmp_dist_sum = dist_sum; } } + if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + if (rd / 2 > ref_best_rd) { + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } + } newbest = i == 0 || rd < best_rd; if (newbest) { best_rd = rd; *best_filter = mbmi->interp_filter; + if (cm->mcomp_filter_type == SWITCHABLE && i && + !(interpolating_intpel_seen && is_intpel_interp)) + best_needs_copy = !best_needs_copy; } if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || (cm->mcomp_filter_type != SWITCHABLE && cm->mcomp_filter_type == mbmi->interp_filter)) { - int p; - - for (p = 0; p < MAX_MB_PLANE; p++) { - const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y; - const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x; - int i; - - for (i = 0; i < y; i++) - vpx_memcpy(&tmp_buf[p][64 * i], - xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x); - } pred_exists = 1; } interpolating_intpel_seen |= is_intpel_interp; } - } + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + } // Set the appripriate filter mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ? cm->mcomp_filter_type : *best_filter; vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - + rs = (cm->mcomp_filter_type == SWITCHABLE ? 
get_switchable_rate(cm, x) : 0); if (pred_exists) { - int p; - - for (p = 0; p < MAX_MB_PLANE; p++) { - const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y; - const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x; - int i; - - for (i = 0; i < y; i++) - vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, - &tmp_buf[p][64 * i], x); + if (best_needs_copy) { + // again temporarily set the buffers to local memory to prevent a memcpy + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = tmp_buf + i * 64 * 64; + xd->plane[i].dst.stride = 64; + } } } else { // Handles the special case when a filter that is not in the @@ -2315,42 +3015,60 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); } + + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + int tmp_rate; + int64_t tmp_dist; + model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist); + rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } + } + if (cpi->common.mcomp_filter_type == SWITCHABLE) *rate2 += get_switchable_rate(cm, x); if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; else if (x->encode_breakout) { + const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var, sse; - int threshold = (xd->plane[0].dequant[1] - * xd->plane[0].dequant[1] >> 4); + int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4); + if (threshold < x->encode_breakout) threshold = x->encode_breakout; - var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf, - x->plane[0].src.stride, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride, - &sse); + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride, + &sse); if ((int)sse < threshold) { unsigned int q2dc = xd->plane[0].dequant[0]; - /* If there is no codeable 2nd order dc - or a very small uniform pixel change change */ + // If there is no codeable 2nd order dc + // or a very small uniform pixel change change if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok int sse2; unsigned int sse2u, sse2v; - var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse2u); - var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf, - x->plane[1].src.stride, - xd->plane[2].dst.buf, - xd->plane[1].dst.stride, &sse2v); + var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse2u); + var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse2v); sse2 = sse2u + sse2v; if (sse2 * 2 < threshold) { @@ -2358,7 +3076,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *distortion = sse + sse2; *rate2 = 500; - /* for best_yrd calculation */ + // for best yrd calculation *rate_uv = 0; *distortion_uv = sse2; @@ -2371,89 +3089,91 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { int 
skippable_y, skippable_uv; + int64_t sseuv = INT_MAX; // Y cost and distortion - super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, - bsize, txfm_cache); + super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, + bsize, txfm_cache, ref_best_rd); + + if (*rate_y == INT_MAX) { + *rate2 = INT_MAX; + *distortion = INT64_MAX; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } *rate2 += *rate_y; *distortion += *distortion_y; super_block_uvrd(cm, x, rate_uv, distortion_uv, - &skippable_uv, bsize); + &skippable_uv, &sseuv, bsize); + *psse += sseuv; *rate2 += *rate_uv; *distortion += *distortion_uv; *skippable = skippable_y && skippable_uv; } - if (!(*mode_excluded)) { - if (is_comp_pred) { - *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); - } else { - *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); - } + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; } return this_rd; // if 0, this will be re-calculated by caller } void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *returnrate, int *returndist, + int *returnrate, int64_t *returndist, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - int rate_y = 0, rate_uv; - int rate_y_tokenonly = 0, rate_uv_tokenonly; - int dist_y = 0, dist_uv; + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip; - int64_t txfm_cache[NB_TXFM_MODES], err; - MB_PREDICTION_MODE mode; - TX_SIZE txfm_size; - int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y; - int64_t err4x4 = INT64_MAX; - int i; + int64_t dist_y = 0, dist_uv = 0, txfm_cache[NB_TXFM_MODES]; - vpx_memset(&txfm_cache,0,sizeof(txfm_cache)); + x->skip_encode = 0; + vpx_memset(&txfm_cache, 0, sizeof(txfm_cache)); ctx->skip = 0; - xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; - err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, bsize, txfm_cache); - mode = xd->mode_info_context->mbmi.mode; - txfm_size = xd->mode_info_context->mbmi.txfm_size; - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip, - (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 : - bsize); - if (bsize < BLOCK_SIZE_SB8X8) - err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y, - &rate4x4_y_tokenonly, - &dist4x4_y, err); + if (bsize >= BLOCK_SIZE_SB8X8) { + if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, bsize, txfm_cache, + best_rd) >= best_rd) { + *returnrate = INT_MAX; + return; + } + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip, bsize); + } else { + y_skip = 0; + if (rd_pick_intra4x4mby_modes(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, best_rd) >= best_rd) { + *returnrate = INT_MAX; + return; + } + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8); + } if (y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); *returndist = dist_y + (dist_uv >> 2); memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); - xd->mode_info_context->mbmi.mode = mode; - xd->mode_info_context->mbmi.txfm_size = txfm_size; - } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) { - *returnrate = rate4x4_y + rate_uv + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); - *returndist = dist4x4_y + (dist_uv >> 2); - vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); - xd->mode_info_context->mbmi.txfm_size = TX_4X4; } else { + int i; *returnrate = rate_y + rate_uv + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); *returndist = dist_y + (dist_uv >> 2); - for (i = 0; i < NB_TXFM_MODES; i++) { - ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode]; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) { + ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->tx_mode]; + } } - xd->mode_info_context->mbmi.txfm_size = txfm_size; - xd->mode_info_context->mbmi.mode = mode; } ctx->mic = *xd->mode_info_context; @@ -2462,15 +3182,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, - int *returndistortion, + int64_t *returndistortion, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx) { + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); MB_PREDICTION_MODE this_mode; - MB_PREDICTION_MODE best_mode = DC_PRED; MV_REFERENCE_FRAME ref_frame; unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; int comp_pred, i; @@ -2483,21 +3203,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx}; - int64_t best_rd = INT64_MAX; + int64_t best_rd = best_rd_so_far; + int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_txfm_rd[NB_TXFM_MODES]; int64_t best_txfm_diff[NB_TXFM_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; + int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; MB_MODE_INFO best_mbmode; int j; int mode_index, best_mode_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], 
ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; - int64_t best_overall_rd = INT64_MAX; - INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; + int64_t best_intra_rd = INT64_MAX; + int64_t best_inter_rd = INT64_MAX; + MB_PREDICTION_MODE best_intra_mode = DC_PRED; + // MB_PREDICTION_MODE best_inter_mode = ZEROMV; + MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB]; - int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB]; + int64_t dist_uv[TX_SIZE_MAX_SB]; + int skip_uv[TX_SIZE_MAX_SB]; MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB]; struct scale_factors scale_factor[4]; unsigned int ref_frame_mask = 0; @@ -2513,10 +3240,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int bws = (1 << bwsl) / 4; // mode_info step for subsize int bhsl = b_height_log2(bsize); int bhs = (1 << bhsl) / 4; // mode_info step for subsize + int best_skip2 = 0; + + x->skip_encode = (cpi->sf.skip_encode_frame && + xd->q_index < QIDX_SKIP_THRESH); for (i = 0; i < 4; i++) { int j; - for (j = 0; j < MAX_REF_FRAMES; j++) seg_mvs[i][j].as_int = INVALID_MV; } @@ -2534,9 +3264,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < NB_TXFM_MODES; i++) best_txfm_rd[i] = INT64_MAX; + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + best_filter_rd[i] = INT64_MAX; + for (i = 0; i < TX_SIZE_MAX_SB; i++) + rate_uv_intra[i] = INT_MAX; + + *returnrate = INT_MAX; // Create a mask set to 1 for each frame used by a smaller resolution. - if (cpi->speed > 0) { + if (cpi->sf.use_avoid_tested_higherror) { switch (block_size) { case BLOCK_64X64: for (i = 0; i < 4; i++) { @@ -2576,22 +3312,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; } - if (cpi->speed == 0 - || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) { - mbmi->mode = DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 : - (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 : - (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32))); - i++) { - mbmi->txfm_size = i; - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i], - &dist_uv[i], &skip_uv[i], - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : - bsize); - mode_uv[i] = mbmi->uv_mode; - } - } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int mode_excluded = 0; @@ -2599,14 +3319,30 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int disable_skip = 0; int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; - int distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable; int64_t txfm_cache[NB_TXFM_MODES]; int i; + int this_skip2 = 0; + int64_t total_sse = INT_MAX; + int early_term = 0; for (i = 0; i < NB_TXFM_MODES; ++i) txfm_cache[i] = INT64_MAX; + this_mode = vp9_mode_order[mode_index].mode; + ref_frame = vp9_mode_order[mode_index].ref_frame; + + // Slip modes that have been masked off but always consider first mode. 
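/* Illustrative sketch (not from the libvpx sources; helper name and
 * parameters are hypothetical): how the RDCOST macro from vp9_rdopt.h
 * combines a rate term and a distortion term, and how the new
 * sf.use_rd_breakout test in this commit abandons a candidate mode when
 * even half of a cheap model-based rd estimate already exceeds the best
 * rd found so far (mirrors `if (rd / 2 > ref_best_rd) return INT64_MAX;`). */
#include <stdint.h>

#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))

/* Returns 1 when the search for this mode can be abandoned early. */
static int rd_breakout_hit(int rdmult, int rddiv,
                           int model_rate, int64_t model_dist,
                           int64_t ref_best_rd) {
  const int64_t rd = RDCOST(rdmult, rddiv, model_rate, model_dist);
  return ref_best_rd < INT64_MAX && rd / 2 > ref_best_rd;
}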
+ if ( mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) && + (cpi->unused_mode_skip_mask & (1 << mode_index)) ) + continue; + + // Skip if the current refernce frame has been masked off + if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && + (cpi->ref_frame_mask & (1 << ref_frame))) + continue; + // Test best rd so far against threshold for trying this mode. if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] * cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) || @@ -2616,14 +3352,18 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Do not allow compound prediction if the segment level reference // frame feature is in use as in this case there can only be one reference. if ((vp9_mode_order[mode_index].second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) + vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) continue; x->skip = 0; - this_mode = vp9_mode_order[mode_index].mode; - ref_frame = vp9_mode_order[mode_index].ref_frame; - if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) { + // Skip some checking based on small partitions' result. + if (x->fast_ms > 1 && !ref_frame) + continue; + if (x->fast_ms > 2 && ref_frame != x->subblock_ref) + continue; + + if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) { if (!(ref_frame_mask & (1 << ref_frame))) { continue; } @@ -2649,27 +3389,32 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; } + comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + if (comp_pred) { + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) + continue; + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) + if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame && + vp9_mode_order[mode_index].second_ref_frame != best_inter_ref_frame) + continue; + } // TODO(jingning, jkoleszar): scaling reference frame not supported for // SPLITMV. if (mbmi->ref_frame[0] > 0 && - (scale_factor[mbmi->ref_frame[0]].x_scale_fp != - (1 << VP9_REF_SCALE_SHIFT) || - scale_factor[mbmi->ref_frame[0]].y_scale_fp != - (1 << VP9_REF_SCALE_SHIFT)) && + (scale_factor[mbmi->ref_frame[0]].x_scale_fp != VP9_REF_NO_SCALE || + scale_factor[mbmi->ref_frame[0]].y_scale_fp != VP9_REF_NO_SCALE) && this_mode == SPLITMV) continue; if (mbmi->ref_frame[1] > 0 && - (scale_factor[mbmi->ref_frame[1]].x_scale_fp != - (1 << VP9_REF_SCALE_SHIFT) || - scale_factor[mbmi->ref_frame[1]].y_scale_fp != - (1 << VP9_REF_SCALE_SHIFT)) && + (scale_factor[mbmi->ref_frame[1]].x_scale_fp != VP9_REF_NO_SCALE || + scale_factor[mbmi->ref_frame[1]].y_scale_fp != VP9_REF_NO_SCALE) && this_mode == SPLITMV) continue; set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); - comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; @@ -2691,9 +3436,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); - mode_excluded = - mode_excluded ? - mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; + mode_excluded = mode_excluded + ? mode_excluded + : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; } else { // mbmi->ref_frame[1] = vp9_mode_order[mode_index].ref_frame[1]; if (ref_frame != INTRA_FRAME) { @@ -2713,23 +3458,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If the segment reference frame feature is enabled.... 
// then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) != + (int)ref_frame) { continue; // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) && + } else if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP) && (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { + } else if (!vp9_segfeature_active(&xd->seg, segment_id, + SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) { + if ((this_mode != ZEROMV && + !(this_mode == NEARMV && + frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && + !(this_mode == NEARESTMV && + frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || + ref_frame != ALTREF_FRAME) { continue; } } @@ -2747,6 +3500,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_mode == I4X4_PRED) { int rate; + /* + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)) + continue; + */ + mbmi->txfm_size = TX_4X4; rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion_y, INT64_MAX); @@ -2754,8 +3513,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 += distortion_y; + if (rate_uv_intra[TX_4X4] == INT_MAX) { + choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4], + &rate_uv_tokenonly[TX_4X4], + &dist_uv[TX_4X4], &skip_uv[TX_4X4], + &mode_uv[TX_4X4]); + } rate2 += rate_uv_intra[TX_4X4]; - rate_uv = rate_uv_intra[TX_4X4]; + rate_uv = rate_uv_tokenonly[TX_4X4]; distortion2 += dist_uv[TX_4X4]; distortion_uv = dist_uv[TX_4X4]; mbmi->uv_mode = mode_uv[TX_4X4]; @@ -2764,41 +3529,68 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, txfm_cache[i] = txfm_cache[ONLY_4X4]; } else if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, - bsize, txfm_cache); - - uv_tx = mbmi->txfm_size; - if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8) - uv_tx = TX_4X4; - if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16) - uv_tx = TX_8X8; - else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32) - uv_tx = TX_16X16; - - rate_uv = rate_uv_intra[uv_tx]; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME) + continue; + } + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mbmi->mode, best_intra_mode)) + 
continue; + } + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + bsize, txfm_cache, best_rd); + + if (rate_y == INT_MAX) + continue; + + uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]); + if (rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx], + &rate_uv_tokenonly[uv_tx], + &dist_uv[uv_tx], &skip_uv[uv_tx], + &mode_uv[uv_tx]); + } + + rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; mbmi->uv_mode = mode_uv[uv_tx]; - rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv; + rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else if (this_mode == SPLITMV) { const int is_comp_pred = mbmi->ref_frame[1] > 0; - int rate, distortion; + int rate; + int64_t distortion; int64_t this_rd_thresh; int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; - int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0; + int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse; + int tmp_best_skippable = 0; int switchable_filter_index; int_mv *second_ref = is_comp_pred ? &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL; union b_mode_info tmp_best_bmodes[16]; MB_MODE_INFO tmp_best_mbmode; PARTITION_INFO tmp_best_partition; + BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS]; int pred_exists = 0; int uv_skippable; + if (is_comp_pred) { + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) + continue; + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) + if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame && + vp9_mode_order[mode_index].second_ref_frame != + best_inter_ref_frame) + continue; + } this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ? 
cpi->rd_threshes[bsize][THR_NEWMV] : @@ -2807,25 +3599,36 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh; xd->mode_info_context->mbmi.txfm_size = TX_4X4; + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; for (switchable_filter_index = 0; switchable_filter_index < VP9_SWITCHABLE_FILTERS; ++switchable_filter_index) { - int newbest; + int newbest, rs; + int64_t rs_rd; mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index]; + vp9_switchable_interp[switchable_filter_index]; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - second_ref, INT64_MAX, + second_ref, + best_yrd, &rate, &rate_y, &distortion, - &skippable, + &skippable, &total_sse, (int)this_rd_thresh, seg_mvs, + bsi, switchable_filter_index, mi_row, mi_col); - if (cpi->common.mcomp_filter_type == SWITCHABLE) { - const int rs = get_switchable_rate(cm, x); - tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0); - } + + if (tmp_rd == INT64_MAX) + continue; + cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; + rs = get_switchable_rate(cm, x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd); + if (cm->mcomp_filter_type == SWITCHABLE) + tmp_rd += rs_rd; + newbest = (tmp_rd < tmp_best_rd); if (newbest) { tmp_best_filter = mbmi->interp_filter; @@ -2834,19 +3637,34 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || (mbmi->interp_filter == cm->mcomp_filter_type && cm->mcomp_filter_type != SWITCHABLE)) { - tmp_best_rdu = tmp_rd; - tmp_best_rate = rate; - tmp_best_ratey = rate_y; - tmp_best_distortion = distortion; - tmp_best_skippable = skippable; - tmp_best_mbmode = *mbmi; - tmp_best_partition = *x->partition_info; - for (i = 0; i < 4; i++) - tmp_best_bmodes[i] = xd->mode_info_context->bmi[i]; - pred_exists = 1; + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_sse = total_sse; + tmp_best_skippable = skippable; + tmp_best_mbmode = *mbmi; + tmp_best_partition = *x->partition_info; + for (i = 0; i < 4; i++) + tmp_best_bmodes[i] = xd->mode_info_context->bmi[i]; + pred_exists = 1; + if (switchable_filter_index == 0 && + cpi->sf.use_rd_breakout && + best_rd < INT64_MAX) { + if (tmp_best_rdu / 2 > best_rd) { + // skip searching the other filters if the first is + // already substantially larger than the best so far + tmp_best_filter = mbmi->interp_filter; + tmp_best_rdu = INT64_MAX; + break; } + } + } } // switchable_filter_index loop + if (tmp_best_rdu == INT64_MAX) + continue; + mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ? 
tmp_best_filter : cm->mcomp_filter_type); vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); @@ -2855,17 +3673,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // switchable list (bilinear, 6-tap) is indicated at the frame level tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - second_ref, INT64_MAX, + second_ref, + best_yrd, &rate, &rate_y, &distortion, - &skippable, + &skippable, &total_sse, (int)this_rd_thresh, seg_mvs, + bsi, 0, mi_row, mi_col); + if (tmp_rd == INT64_MAX) + continue; } else { if (cpi->common.mcomp_filter_type == SWITCHABLE) { int rs = get_switchable_rate(cm, x); tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); } tmp_rd = tmp_best_rdu; + total_sse = tmp_best_sse; rate = tmp_best_rate; rate_y = tmp_best_ratey; distortion = tmp_best_distortion; @@ -2882,29 +3705,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mcomp_filter_type == SWITCHABLE) rate2 += get_switchable_rate(cm, x); - // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV - vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, - BLOCK_SIZE_SB8X8); - vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8); - super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv, - &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4); - rate2 += rate_uv; - distortion2 += distortion_uv; - skippable = skippable && uv_skippable; - - txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < NB_TXFM_MODES; ++i) - txfm_cache[i] = txfm_cache[ONLY_4X4]; - if (!mode_excluded) { if (is_comp_pred) mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY; else mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY; } - compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred); + + if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) < + best_rd) { + // If even the 'Y' rd value of split is higher than best so far + // then dont bother looking at UV + vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, + BLOCK_SIZE_SB8X8); + vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8); + super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv, + &uv_skippable, &uv_sse, + BLOCK_SIZE_SB8X8, TX_4X4); + rate2 += rate_uv; + distortion2 += distortion_uv; + skippable = skippable && uv_skippable; + total_sse += uv_sse; + + txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + for (i = 0; i < NB_TXFM_MODES; ++i) + txfm_cache[i] = txfm_cache[ONLY_4X4]; + } } else { compmode_cost = vp9_cost_bit(comp_mode_p, mbmi->ref_frame[1] > INTRA_FRAME); @@ -2914,9 +3741,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate_y, &distortion_y, &rate_uv, &distortion_uv, &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv[this_mode], + &tmp_best_filter, frame_mv, mi_row, mi_col, - single_newmv); + single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) continue; } @@ -2938,15 +3765,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // because there are no non zero coefficients and make any // necessary adjustment for rate. Ignore if skip is coded at // segment level as the cost wont have been added in. - int mb_skip_allowed; - // Is Mb level skip allowed (i.e. not coded at segment level). 
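/* Illustrative sketch (hypothetical helper, not from the libvpx sources) of
 * the skip decision added a few lines below: for a non-intra, non-lossless
 * block where mb-level skip is allowed, the encoder compares the rd cost of
 * coding the residual against the rd cost of skipping and charging the raw
 * prediction SSE as distortion.  *rate2 is assumed to already include the
 * coefficient rate (rate_y + rate_uv), as it does in the real loop. */
#include <stdint.h>

#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))

static int decide_mb_skip(int rdmult, int rddiv,
                          int rate_y, int rate_uv,
                          int64_t coded_dist, int64_t pred_sse,
                          int noskip_flag_cost, int skip_flag_cost,
                          int *rate2, int64_t *dist2) {
  if (RDCOST(rdmult, rddiv, rate_y + rate_uv, coded_dist) <
      RDCOST(rdmult, rddiv, 0, pred_sse)) {
    *rate2 += noskip_flag_cost;   /* keep coding the coefficients */
    *dist2 = coded_dist;
    return 0;
  }
  *rate2 += skip_flag_cost;       /* signal the skip flag instead */
  *rate2 -= rate_y + rate_uv;     /* coefficient bits are not spent */
  *dist2 = pred_sse;              /* distortion is the prediction SSE */
  return 1;                       /* corresponds to this_skip2 = 1 */
}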
- mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); + const int mb_skip_allowed = !vp9_segfeature_active(&xd->seg, segment_id, + SEG_LVL_SKIP); if (skippable && bsize >= BLOCK_SIZE_SB8X8) { // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); - // for best_yrd calculation + // for best yrd calculation rate_uv = 0; if (mb_skip_allowed) { @@ -2954,17 +3780,37 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Cost the skip mb case vp9_prob skip_prob = - vp9_get_pred_prob(cm, xd, PRED_MBSKIP); + vp9_get_pred_prob_mbskip(cm, xd); if (skip_prob) { prob_skip_cost = vp9_cost_bit(skip_prob, 1); rate2 += prob_skip_cost; } } + } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && + !xd->lossless) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { + // Add in the cost of the no skip flag. + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 0); + rate2 += prob_skip_cost; + } else { + // FIXME(rbultje) make this work for splitmv also + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 1); + rate2 += prob_skip_cost; + distortion2 = total_sse; + assert(total_sse >= 0); + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + this_skip2 = 1; + } } else if (mb_skip_allowed) { // Add in the cost of the no skip flag. - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, - PRED_MBSKIP), 0); + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 0); rate2 += prob_skip_cost; } @@ -2972,23 +3818,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } -#if 0 - // Keep record of best intra distortion - if ((xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) && - (this_rd < best_intra_rd)) { + // Keep record of best intra rd + if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME && + is_intra_mode(xd->mode_info_context->mbmi.mode) && + this_rd < best_intra_rd) { best_intra_rd = this_rd; - *returnintra = distortion2; + best_intra_mode = xd->mode_info_context->mbmi.mode; + } + // Keep record of best inter rd with single reference + if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME && + xd->mode_info_context->mbmi.ref_frame[1] == NONE && + !mode_excluded && + this_rd < best_inter_rd) { + best_inter_rd = this_rd; + best_inter_ref_frame = ref_frame; + // best_inter_mode = xd->mode_info_context->mbmi.mode; } -#endif - if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) + if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - - if (this_rd < best_overall_rd) { - best_overall_rd = this_rd; - best_filter = tmp_best_filter; - best_mode = this_mode; + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } if (this_mode != I4X4_PRED && this_mode != SPLITMV) { @@ -3007,6 +3858,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_rd || x->skip) { if (!mode_excluded) { // Note index of best mode so far + const int qstep = xd->plane[0].dequant[1]; + best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { @@ -3017,12 +3870,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = rate2; *returndistortion = distortion2; best_rd = this_rd; + best_yrd = best_rd - + RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = 
*mbmi; + best_skip2 = this_skip2; best_partition = *x->partition_info; if (this_mode == I4X4_PRED || this_mode == SPLITMV) for (i = 0; i < 4; i++) best_bmodes[i] = xd->mode_info_context->bmi[i]; + + // TODO(debargha): enhance this test with a better distortion prediction + // based on qp, activity mask and history + if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) + if (ref_frame > INTRA_FRAME && distortion2 * 4 < qstep * qstep) + early_term = 1; } #if 0 // Testing this mode gave rise to an improvement in best error score. @@ -3075,6 +3937,26 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[HYBRID_PREDICTION] = hybrid_rd; } + /* keep record of best filter type */ + if (!mode_excluded && !disable_skip && mbmi->ref_frame[0] != INTRA_FRAME && + cm->mcomp_filter_type != BILINEAR) { + int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? + VP9_SWITCHABLE_FILTERS : + vp9_switchable_interp_map[cm->mcomp_filter_type]]; + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + int64_t adj_rd; + // In cases of poor prediction, filter_cache[] can contain really big + // values, which actually are bigger than this_rd itself. This can + // cause negative best_filter_rd[] values, which is obviously silly. + // Therefore, if filter_cache < ref, we do an adjusted calculation. + if (cpi->rd_filter_cache[i] >= ref) + adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; + else // FIXME(rbultje) do this for comppred also + adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref; + best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); + } + } + /* keep record of best txfm size */ if (bsize < BLOCK_SIZE_SB32X32) { if (bsize < BLOCK_SIZE_MB16X16) { @@ -3088,7 +3970,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < NB_TXFM_MODES; i++) { int64_t adj_rd = INT64_MAX; if (this_mode != I4X4_PRED) { - adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode]; + adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->tx_mode]; } else { adj_rd = this_rd; } @@ -3098,9 +3980,41 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } + if (early_term) + break; + if (x->skip && !mode_excluded) break; } + if (best_rd >= best_rd_so_far) + return INT64_MAX; + + // If we used an estimate for the uv intra rd in the loop above... + if (cpi->sf.use_uv_intra_rd_estimate) { + // Do Intra UV best rd mode selection if best mode choice above was intra. + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { + TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + &rate_uv_tokenonly[uv_tx_size], + &dist_uv[uv_tx_size], + &skip_uv[uv_tx_size], + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 + : bsize); + } + } + + // If indicated then mark the index of the chosen mode to be inspected at + // other block sizes. + if (bsize <= cpi->sf.unused_mode_skip_lvl) { + cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask & + (~((int64_t)1 << best_mode_index)); + } + + // If we are using reference masking and the set mask flag is set then + // create the reference frame mask. + if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) + cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame); + // Flag all modes that have a distortion thats > 2x the best we found at // this level. 
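/* Illustrative sketch (hypothetical helper, not from the libvpx sources) of
 * the per-filter rd bookkeeping recorded just above: rd_filter_cache[] holds
 * a modeled rd for each interpolation filter, and `ref` is the cache entry
 * for the filter that was actually searched.  A cache value above `ref` is
 * added to this_rd linearly; a value below it is scaled proportionally to
 * this_rd, so the adjusted rd, and hence best_filter_rd[], stays non-negative. */
#include <stdint.h>

static int64_t adjusted_filter_rd(int64_t this_rd,
                                  int64_t filter_cache_rd,
                                  int64_t ref) {
  if (filter_cache_rd >= ref)
    return this_rd + (filter_cache_rd - ref);
  /* (ref - filter_cache_rd) / ref is below 1, so the result stays >= 0 */
  return this_rd - (ref - filter_cache_rd) * this_rd / ref;
}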
for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { @@ -3130,26 +4044,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, (cm->mcomp_filter_type == best_mbmode.interp_filter) || (best_mbmode.ref_frame[0] == INTRA_FRAME)); - // Accumulate filter usage stats - // TODO(agrange): Use RD criteria to select interpolation filter mode. - if (is_inter_mode(best_mode)) - ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]]; - // Updating rd_thresh_freq_fact[] here means that the differnt // partition/block sizes are handled independently based on the best // choice for the current partition. It may well be better to keep a scaled // best rd so far value and update rd_thresh_freq_fact based on the mode/size // combination that wins out. - if (cpi->sf.adpative_rd_thresh) { + if (cpi->sf.adaptive_rd_thresh) { for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { if (mode_index == best_mode_index) { cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT; } else { cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC; if (cpi->rd_thresh_freq_fact[bsize][mode_index] > - (cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) { + (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) { cpi->rd_thresh_freq_fact[bsize][mode_index] = - cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT; + cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT; } } } @@ -3170,36 +4079,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } #endif - // This code forces Altref,0,0 and skip for the frame that overlays a - // an alrtef unless Altref is filtered. However, this is unsafe if - // segment level coding of ref frame is enabled for this segment. - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - cpi->is_src_frame_alt_ref && - (cpi->oxcf.arnr_max_frames == 0) && - (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame[0] != ALTREF_FRAME) - && bsize >= BLOCK_SIZE_SB8X8) { - mbmi->mode = ZEROMV; - mbmi->ref_frame[0] = ALTREF_FRAME; - mbmi->ref_frame[1] = NONE; - mbmi->mv[0].as_int = 0; - mbmi->uv_mode = DC_PRED; - mbmi->mb_skip_coeff = 1; - if (cm->txfm_mode == TX_MODE_SELECT) { - if (bsize >= BLOCK_SIZE_SB32X32) - mbmi->txfm_size = TX_32X32; - else if (bsize >= BLOCK_SIZE_MB16X16) - mbmi->txfm_size = TX_16X16; - else - mbmi->txfm_size = TX_8X8; - } - - vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); - vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff)); - goto end; - } - // macroblock modes *mbmi = best_mbmode; + x->skip |= best_skip2; if (best_mbmode.ref_frame[0] == INTRA_FRAME && best_mbmode.sb_type < BLOCK_SIZE_SB8X8) { for (i = 0; i < 4; i++) @@ -3219,8 +4101,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *x->partition_info = best_partition; - mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int; - mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int; + mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int; } for (i = 0; i < NB_PREDICTION_TYPES; ++i) { @@ -3231,6 +4113,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + if (best_filter_rd[i] == INT64_MAX) + best_filter_diff[i] = 0; + else + best_filter_diff[i] = best_rd - best_filter_rd[i]; + } + if (cm->mcomp_filter_type == SWITCHABLE) + assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0); + } else { + vpx_memset(best_filter_diff, 
0, sizeof(best_filter_diff)); + } + + if (!x->skip) { for (i = 0; i < NB_TXFM_MODES; i++) { if (best_txfm_rd[i] == INT64_MAX) best_txfm_diff[i] = 0; @@ -3241,7 +4136,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); } - end: set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); store_coding_context(x, ctx, best_mode_index, @@ -3249,7 +4143,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]][0], - best_pred_diff, best_txfm_diff); + best_pred_diff, best_txfm_diff, best_filter_diff); return best_rd; } diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index dcf5d00..7c84b48 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -15,18 +15,20 @@ #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) +#define QIDX_SKIP_THRESH 115 + void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx); + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - int *r, int *d, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx); + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); void vp9_init_me_luts(); diff --git a/libvpx/vp9/encoder/vp9_sad_c.c b/libvpx/vp9/encoder/vp9_sad_c.c index 6b1ba49..42ddb21 100644 --- a/libvpx/vp9/encoder/vp9_sad_c.c +++ b/libvpx/vp9/encoder/vp9_sad_c.c @@ -11,25 +11,43 @@ #include <stdlib.h> #include "vp9/common/vp9_sadmxn.h" +#include "vp9/encoder/vp9_variance.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "./vp9_rtcd.h" -unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64); -} - -unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32); -} +#define sad_mxn_func(m, n) \ +unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + unsigned int max_sad) { \ + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ +} \ +unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + const uint8_t *second_pred, \ + unsigned int max_sad) { \ + uint8_t comp_pred[m * n]; \ + comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \ +} + +sad_mxn_func(64, 64) +sad_mxn_func(64, 32) +sad_mxn_func(32, 64) +sad_mxn_func(32, 32) +sad_mxn_func(32, 16) +sad_mxn_func(16, 32) +sad_mxn_func(16, 16) +sad_mxn_func(16, 8) +sad_mxn_func(8, 16) +sad_mxn_func(8, 8) +sad_mxn_func(8, 4) +sad_mxn_func(4, 8) +sad_mxn_func(4, 4) void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, @@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t 
*src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64); -} - void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32); -} - -unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16); -} - void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32); -} - void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16); -} - -unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8); -} - - -unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8); -} - -unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16); -} - -unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4); -} - -unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8); -} - -unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); -} - void vp9_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index fe995ad..ef84cc5 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -18,14 +18,14 @@ void vp9_enable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; - cpi->mb.e_mbd.segmentation_enabled = 1; - cpi->mb.e_mbd.update_mb_segmentation_map = 1; - cpi->mb.e_mbd.update_mb_segmentation_data = 1; + 
cpi->mb.e_mbd.seg.enabled = 1; + cpi->mb.e_mbd.seg.update_map = 1; + cpi->mb.e_mbd.seg.update_data = 1; } void vp9_disable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; - cpi->mb.e_mbd.segmentation_enabled = 0; + cpi->mb.e_mbd.seg.enabled = 0; } void vp9_set_segmentation_map(VP9_PTR ptr, @@ -37,8 +37,8 @@ void vp9_set_segmentation_map(VP9_PTR ptr, (cpi->common.mi_rows * cpi->common.mi_cols)); // Signal that the map should be updated. - cpi->mb.e_mbd.update_mb_segmentation_map = 1; - cpi->mb.e_mbd.update_mb_segmentation_data = 1; + cpi->mb.e_mbd.seg.update_map = 1; + cpi->mb.e_mbd.seg.update_data = 1; } void vp9_set_segment_data(VP9_PTR ptr, @@ -46,10 +46,10 @@ void vp9_set_segment_data(VP9_PTR ptr, unsigned char abs_delta) { VP9_COMP *cpi = (VP9_COMP *)(ptr); - cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; + cpi->mb.e_mbd.seg.abs_delta = abs_delta; - vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data, - sizeof(cpi->mb.e_mbd.segment_feature_data)); + vpx_memcpy(cpi->mb.e_mbd.seg.feature_data, feature_data, + sizeof(cpi->mb.e_mbd.seg.feature_data)); // TBD ?? Set the feature mask // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0, @@ -115,8 +115,7 @@ static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, - MODE_INFO *mi, +static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -137,20 +136,19 @@ static void count_segs(VP9_COMP *cpi, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; // Test to see if the segment id matches the predicted value. - const int pred_seg_id = vp9_get_pred_mi_segid(cm, mi->mbmi.sb_type, - mi_row, mi_col); - const int seg_predicted = (segment_id == pred_seg_id); - - // Get the segment id prediction context - const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID); + const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, + bsize, mi_row, mi_col); + const int pred_flag = pred_segment_id == segment_id; + const int pred_context = vp9_get_pred_context_seg_id(xd); // Store the prediction status for this mb and update counts // as appropriate - vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted); - temporal_predictor_count[pred_context][seg_predicted]++; + vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + temporal_predictor_count[pred_context][pred_flag]++; - if (!seg_predicted) + if (!pred_flag) // Update the "unpredicted" segment count t_unpred_seg_counts[segment_id]++; } @@ -218,15 +216,14 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { int no_pred_cost; int t_pred_cost = INT_MAX; - int i; - int tile_col, mi_row, mi_col; + int i, tile_col, mi_row, mi_col; int temporal_predictor_count[PREDICTION_PROBS][2]; - int no_pred_segcounts[MAX_MB_SEGMENTS]; - int t_unpred_seg_counts[MAX_MB_SEGMENTS]; + int no_pred_segcounts[MAX_SEGMENTS]; + int t_unpred_seg_counts[MAX_SEGMENTS]; - vp9_prob no_pred_tree[MB_SEG_TREE_PROBS]; - vp9_prob t_pred_tree[MB_SEG_TREE_PROBS]; + vp9_prob no_pred_tree[SEG_TREE_PROBS]; + vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; @@ -234,8 +231,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // Set default state for the segment tree probabilities and the // temporal coding probabilities - vpx_memset(xd->mb_segment_tree_probs, 255, 
sizeof(xd->mb_segment_tree_probs)); - vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs)); + vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs)); + vpx_memset(xd->seg.pred_probs, 255, sizeof(xd->seg.pred_probs)); vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts)); vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts)); @@ -243,18 +240,16 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one - for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { vp9_get_tile_col_offsets(cm, tile_col); mi_ptr = cm->mi + cm->cur_tile_mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; - mi_col += 8, mi += 8) { + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += 8, mi += 8) count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64); - } } } @@ -285,11 +280,11 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // Now choose which coding method to use. if (t_pred_cost < no_pred_cost) { - cm->temporal_update = 1; - vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree)); - vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); + xd->seg.temporal_update = 1; + vpx_memcpy(xd->seg.tree_probs, t_pred_tree, sizeof(t_pred_tree)); + vpx_memcpy(xd->seg.pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); } else { - cm->temporal_update = 0; - vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree)); + xd->seg.temporal_update = 0; + vpx_memcpy(xd->seg.tree_probs, no_pred_tree, sizeof(no_pred_tree)); } } diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c index 363ed84..c155516 100644 --- a/libvpx/vp9/encoder/vp9_ssim.c +++ b/libvpx/vp9/encoder/vp9_ssim.c @@ -88,8 +88,9 @@ double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1, double ssim_total = 0; // sample point start with each 4x4 location - for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j < width - 8; j += 4) { + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); ssim_total += v; samples++; @@ -104,16 +105,16 @@ double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double ssimv; a = vp9_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, source->y_width, - source->y_height); + source->y_stride, dest->y_stride, + source->y_crop_width, source->y_crop_height); b = vp9_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); c = vp9_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); ssimv = a * .8 + .1 * (b + c); @@ -128,16 +129,16 @@ double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double a, b, c; a = vp9_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, 
dest->y_stride, source->y_width, - source->y_height); + source->y_stride, dest->y_stride, + source->y_crop_width, source->y_crop_height); b = vp9_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); c = vp9_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); *ssim_y = a; *ssim_u = b; *ssim_v = c; diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c new file mode 100644 index 0000000..667b801 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_subexp.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" + +#include "vp9/encoder/vp9_boolhuff.h" +#include "vp9/encoder/vp9_treewriter.h" + +#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8) +#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) + +static int update_bits[255]; + +static int count_uniform(int v, int n) { + int l = get_unsigned_bits(n); + int m; + if (l == 0) return 0; + m = (1 << l) - n; + if (v < m) + return l - 1; + else + return l; +} + +static int split_index(int i, int n, int modulus) { + int max1 = (n - 1 - modulus / 2) / modulus + 1; + if (i % modulus == modulus / 2) + i = i / modulus; + else + i = max1 + i - (i + modulus - modulus / 2) / modulus; + return i; +} + +static int recenter_nonneg(int v, int m) { + if (v > (m << 1)) + return v; + else if (v >= m) + return ((v - m) << 1); + else + return ((m - v) << 1) - 1; +} + +static int remap_prob(int v, int m) { + int i; + static const int map_table[MAX_PROB - 1] = { + // generated by: + // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM); + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, + 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130, + 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, + 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, + 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213, + 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, + }; + v--; + m--; + if ((m << 1) <= MAX_PROB) + i = 
recenter_nonneg(v, m) - 1; + else + i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + + i = map_table[i]; + return i; +} + +static int count_term_subexp(int word, int k, int num_syms) { + int count = 0; + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (num_syms <= mk + 3 * a) { + count += count_uniform(word - mk, num_syms - mk); + break; + } else { + int t = (word >= mk + a); + count++; + if (t) { + i = i + 1; + mk += a; + } else { + count += b; + break; + } + } + } + return count; +} + +static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { + int delp = remap_prob(newp, oldp); + return update_bits[delp] * 256; +} + +static void encode_uniform(vp9_writer *w, int v, int n) { + int l = get_unsigned_bits(n); + int m; + if (l == 0) + return; + m = (1 << l) - n; + if (v < m) { + vp9_write_literal(w, v, l - 1); + } else { + vp9_write_literal(w, m + ((v - m) >> 1), l - 1); + vp9_write_literal(w, (v - m) & 1, 1); + } +} + +static void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (num_syms <= mk + 3 * a) { + encode_uniform(w, word - mk, num_syms - mk); + break; + } else { + int t = (word >= mk + a); + vp9_write_literal(w, t, 1); + if (t) { + i = i + 1; + mk += a; + } else { + vp9_write_literal(w, word - mk, b); + break; + } + } + } +} + +void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) { + const int delp = remap_prob(newp, oldp); + encode_term_subexp(w, delp, SUBEXP_PARAM, 255); +} + +void vp9_compute_update_table() { + int i; + for (i = 0; i < 254; i++) + update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255); +} + +int vp9_prob_diff_update_savings_search(const unsigned int *ct, + vp9_prob oldp, vp9_prob *bestp, + vp9_prob upd) { + const int old_b = cost_branch256(ct, oldp); + int bestsavings = 0; + vp9_prob newp, bestnewp = oldp; + const int step = *bestp > oldp ? -1 : 1; + + for (newp = *bestp; newp != oldp; newp += step) { + const int new_b = cost_branch256(ct, newp); + const int update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256; + const int savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + *bestp = bestnewp; + return bestsavings; +} + +int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vp9_prob *oldp, + vp9_prob *bestp, + vp9_prob upd, + int b, int r) { + int i, old_b, new_b, update_b, savings, bestsavings, step; + int newp; + vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; + vp9_model_to_full_probs(oldp, oldplist); + vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES); + for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) + old_b += cost_branch256(ct + 2 * i, oldplist[i]); + old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]); + + bestsavings = 0; + bestnewp = oldp[PIVOT_NODE]; + + step = (*bestp > oldp[PIVOT_NODE] ? 
-1 : 1); + + for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) { + if (newp < 1 || newp > 255) + continue; + newplist[PIVOT_NODE] = newp; + vp9_model_to_full_probs(newplist, newplist); + for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) + new_b += cost_branch256(ct + 2 * i, newplist[i]); + new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]); + update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + + vp9_cost_upd256; + savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + *bestp = bestnewp; + return bestsavings; +} + +void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, + vp9_prob upd, unsigned int *ct) { + vp9_prob newp = get_binary_prob(ct[0], ct[1]); + const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, + upd); + assert(newp >= 1); + if (savings > 0) { + vp9_write(w, 1, upd); + vp9_write_prob_diff_update(w, newp, *oldp); + *oldp = newp; + } else { + vp9_write(w, 0, upd); + } +} diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h new file mode 100644 index 0000000..7acdaf6 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_subexp.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_DECODER_VP9_SUBEXP_H_ +#define VP9_DECODER_VP9_SUBEXP_H_ + +void vp9_compute_update_table(); + + +void vp9_write_prob_diff_update(vp9_writer *w, + vp9_prob newp, vp9_prob oldp); + +void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, + vp9_prob upd, unsigned int *ct); + +int vp9_prob_diff_update_savings_search(const unsigned int *ct, + vp9_prob oldp, vp9_prob *bestp, + vp9_prob upd); + + +int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vp9_prob *oldp, + vp9_prob *bestp, + vp9_prob upd, + int b, int r); + +#endif // VP9_DECODER_VP9_SUBEXP_H_ diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 47792fc..821b7c6 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -51,25 +51,25 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, &xd->scale_factor[which_mv], 16, 16, which_mv, - &xd->subpix); + &xd->subpix, MV_PRECISION_Q3); stride = (stride + 1) >> 1; - vp9_build_inter_predictor_q4(u_mb_ptr, stride, - &pred[256], 8, - &mv, - &xd->scale_factor_uv[which_mv], - 8, 8, - which_mv, - &xd->subpix); - - vp9_build_inter_predictor_q4(v_mb_ptr, stride, - &pred[320], 8, - &mv, - &xd->scale_factor_uv[which_mv], - 8, 8, - which_mv, - &xd->subpix); + vp9_build_inter_predictor(u_mb_ptr, stride, + &pred[256], 8, + &mv, + &xd->scale_factor[which_mv], + 8, 8, + which_mv, + &xd->subpix, MV_PRECISION_Q4); + + vp9_build_inter_predictor(v_mb_ptr, stride, + &pred[320], 8, + &mv, + &xd->scale_factor[which_mv], + 8, 8, + which_mv, + &xd->subpix, MV_PRECISION_Q4); } void vp9_temporal_filter_apply_c(uint8_t *frame1, @@ -148,9 +148,10 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // Further step/diamond searches as necessary if (cpi->speed < 8) - step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 
1 : 0); + step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0); else - step_param = cpi->sf.first_step + 2; + step_param = cpi->sf.reduce_first_step_size + 2; + step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); /*cpi->sf.search_method == HEX*/ // TODO Check that the 16x16 vf & sdf are selected here @@ -442,7 +443,6 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { cm->yv12_fb[cm->new_fb_idx].y_crop_width, cm->yv12_fb[cm->new_fb_idx].y_crop_height, cm->width, cm->height); - cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0]; // Setup frame pointers, NULL indicates frame not included in filter vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *)); diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 0a290e1..4b9c6c8 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -90,8 +90,6 @@ static void fill_value_tokens() { vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; } -extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); - struct tokenize_b_args { VP9_COMP *cpi; MACROBLOCKD *xd; @@ -106,7 +104,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; - PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; TX_SIZE tx_size = ss_txfrm_size / 2; int dry_run = args->dry_run; @@ -115,6 +112,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int c = 0, rc = 0; TOKENEXTRA *t = *tp; /* store tokens starting here */ const int eob = xd->plane[plane].eobs[block]; + const PLANE_TYPE type = xd->plane[plane].plane_type; const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : mbmi->sb_type; @@ -125,56 +123,42 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, const int loff = (off >> mod) << tx_size; ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - int seg_eob, default_eob, pad; + int seg_eob; const int segment_id = mbmi->segment_id; - const int *scan, *nb; + const int16_t *scan, *nb; vp9_coeff_count *counts; vp9_coeff_probs_model *coef_probs; const int ref = mbmi->ref_frame[0] != INTRA_FRAME; ENTROPY_CONTEXT above_ec, left_ec; uint8_t token_cache[1024]; - TX_TYPE tx_type = DCT_DCT; - const uint8_t * band_translate; + const uint8_t *band_translate; assert((!type && !plane) || (type && plane)); counts = cpi->coef_counts[tx_size]; coef_probs = cpi->common.fc.coef_probs[tx_size]; switch (tx_size) { default: - case TX_4X4: { - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_4x4(xd, block) : DCT_DCT; + case TX_4X4: above_ec = A[0] != 0; left_ec = L[0] != 0; seg_eob = 16; - scan = get_scan_4x4(tx_type); + scan = get_scan_4x4(get_tx_type_4x4(type, xd, block)); band_translate = vp9_coefband_trans_4x4; break; - } - case TX_8X8: { - const int sz = 1 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; + case TX_8X8: above_ec = (A[0] + A[1]) != 0; left_ec = (L[0] + L[1]) != 0; seg_eob = 64; - scan = get_scan_8x8(tx_type); + scan = get_scan_8x8(get_tx_type_8x8(type, xd)); band_translate = vp9_coefband_trans_8x8plus; break; - } - case TX_16X16: { - const int sz = 2 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; + case TX_16X16: above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; seg_eob = 256; - scan = get_scan_16x16(tx_type); + scan = get_scan_16x16(get_tx_type_16x16(type, xd)); band_translate = vp9_coefband_trans_8x8plus; break; - } case TX_32X32: above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; @@ -185,10 +169,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, } pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); - default_eob = seg_eob; + nb = vp9_get_coef_neighbors_handle(scan); - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) seg_eob = 0; c = 0; @@ -198,7 +181,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int v = 0; rc = scan[c]; if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob); + pt = get_coef_context(nb, token_cache, c); if (c < eob) { v = qcoeff_ptr[rc]; assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE); @@ -213,21 +196,12 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, t->context_tree = coef_probs[type][ref][band][pt]; t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0); -#if CONFIG_BALANCED_COEFTREE - assert(token <= ZERO_TOKEN || - vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); -#else assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); -#endif if (!dry_run) { ++counts[type][ref][band][pt][token]; -#if CONFIG_BALANCED_COEFTREE - if (!t->skip_eob_node && token > ZERO_TOKEN) -#else if (!t->skip_eob_node) -#endif - ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt]; + ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt]; } token_cache[scan[c]] = vp9_pt_energy_class[token]; ++t; @@ -263,8 +237,7 @@ int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { int result = 1; struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_in_plane(xd, bsize, 0, - is_skippable, &args); + foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args); return result; } @@ -275,26 +248,22 @@ int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { return result; } -void vp9_tokenize_sb(VP9_COMP *cpi, - MACROBLOCKD *xd, - TOKENEXTRA **t, - int dry_run, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; +void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; TOKENEXTRA *t_backup = *t; - const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP); - const int segment_id = mbmi->segment_id; - const int skip_inc = !vp9_segfeature_active(xd, segment_id, 
SEG_LVL_SKIP); + const int mb_skip_context = vp9_get_pred_context_mbskip(xd); + const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id, + SEG_LVL_SKIP); const TX_SIZE txfm_size = mbmi->txfm_size; - struct tokenize_b_args arg = { - cpi, xd, t, txfm_size, dry_run - }; + struct tokenize_b_args arg = { cpi, xd, t, txfm_size, dry_run }; mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize); - if (mbmi->mb_skip_coeff) { if (!dry_run) - cm->fc.mbskip_count[mb_skip_context][1] += skip_inc; + cm->counts.mbskip[mb_skip_context][1] += skip_inc; vp9_reset_sb_tokens_context(xd, bsize); if (dry_run) *t = t_backup; @@ -302,7 +271,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, } if (!dry_run) - cm->fc.mbskip_count[mb_skip_context][0] += skip_inc; + cm->counts.mbskip[mb_skip_context][0] += skip_inc; foreach_transformed_block(xd, bsize, tokenize_b, &arg); diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index e7f90c9..bc7d935 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -36,8 +36,8 @@ int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); struct VP9_COMP; -void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize); +void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, + BLOCK_SIZE_TYPE bsize); #ifdef ENTROPY_STATS void init_context_counters(); diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h index 38808d7..6e686d6 100644 --- a/libvpx/vp9/encoder/vp9_variance.h +++ b/libvpx/vp9/encoder/vp9_variance.h @@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int ref_stride, unsigned int max_sad); +typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred, + unsigned int max_sad); + typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, int ref_stride); typedef struct vp9_variance_vtable { - vp9_sad_fn_t sdf; - vp9_variance_fn_t vf; - vp9_subpixvariance_fn_t svf; - vp9_subp_avg_variance_fn_t svaf; - vp9_variance_fn_t svf_halfpix_h; - vp9_variance_fn_t svf_halfpix_v; - vp9_variance_fn_t svf_halfpix_hv; - vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi1_fn_t sdx8f; - vp9_sad_multi_d_fn_t sdx4df; + vp9_sad_fn_t sdf; + vp9_sad_avg_fn_t sdaf; + vp9_variance_fn_t vf; + vp9_subpixvariance_fn_t svf; + vp9_subp_avg_variance_fn_t svaf; + vp9_variance_fn_t svf_halfpix_h; + vp9_variance_fn_t svf_halfpix_v; + vp9_variance_fn_t svf_halfpix_hv; + vp9_sad_multi_fn_t sdx3f; + vp9_sad_multi1_fn_t sdx8f; + vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, uint8_t *ref, int ref_stride) { + int height, const uint8_t *ref, int ref_stride) { int i, j; for (i = 0; i < height; i++) { diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm deleted file mode 100644 index 54766d8..0000000 --- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
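For reference, the vp9_subexp.c hunk earlier in this change reduces to a cost/benefit search: a new branch probability is only signalled when the bits saved on the counted branches outweigh the subexponentially coded delta plus the update flag. Below is a minimal scalar sketch of that decision; bits_for_branch() and bits_for_delta() are simplified, hypothetical stand-ins for cost_branch256() and prob_diff_update_cost(), not the library's fixed-point cost tables.

```c
#include <math.h>
#include <stdio.h>

/* Hypothetical stand-in for cost_branch256(): bits needed to code ct[0]
 * zero-branches and ct[1] one-branches when the "zero" probability is p/256. */
static double bits_for_branch(const unsigned int ct[2], int p) {
  const double p0 = p / 256.0;
  return -(ct[0] * log2(p0) + ct[1] * log2(1.0 - p0));
}

/* Hypothetical stand-in for prob_diff_update_cost(): assume the subexp-coded
 * delta costs roughly 2 * log2(|delta| + 1) + 2 bits. */
static double bits_for_delta(int newp, int oldp) {
  const int d = newp > oldp ? newp - oldp : oldp - newp;
  return 2.0 * log2(d + 1.0) + 2.0;
}

/* Same shape as vp9_prob_diff_update_savings_search(): walk from the
 * candidate probability toward the old one and keep the best net saving. */
static double savings_search(const unsigned int ct[2], int oldp, int *bestp) {
  const int step = *bestp > oldp ? -1 : 1;
  const double old_bits = bits_for_branch(ct, oldp);
  double best_savings = 0.0;
  int newp, best_newp = oldp;

  for (newp = *bestp; newp != oldp; newp += step) {
    const double savings = old_bits - bits_for_branch(ct, newp)
                         - bits_for_delta(newp, oldp);
    if (savings > best_savings) {
      best_savings = savings;
      best_newp = newp;
    }
  }
  *bestp = best_newp;
  return best_savings;
}

int main(void) {
  const unsigned int ct[2] = { 900, 100 };  /* observed branch counts */
  int newp = 230;                           /* candidate derived from counts */
  printf("saved %.1f bits, new prob %d\n", savings_search(ct, 128, &newp), newp);
  return 0;
}
```

Only when the returned saving is positive does vp9_cond_prob_diff_update() write the update flag followed by the subexp-coded delta; otherwise a single zero flag is sent and the old probability stays in force.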
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch) -global sym(vp9_short_fdct4x4_mmx) PRIVATE -sym(vp9_short_fdct4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ; input - mov rdi, arg(1) ; output - - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] - ; read the input data - movq mm0, [rsi] - movq mm1, [rsi + rax] - - movq mm2, [rcx] - movq mm4, [rcx + rax] - - ; transpose for the first stage - movq mm3, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 20 21 22 23 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm3, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm4 ; 20 30 21 31 - punpckhwd mm5, mm4 ; 22 32 23 33 - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm3 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm3, mm5 ; 03 13 23 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 3 - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - - paddw mm0, mm3 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm4, mm2 ; c1 = 1 - 2 - psubw mm5, mm3 ; d1 = 0 - 3 - - psllw mm5, 3 - psllw mm4, 3 - - psllw mm0, 3 - psllw mm1, 3 - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; op[0] = a1 + b1 - psubw mm2, mm1 ; op[2] = a1 - b1 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm4 ; c1 d1 - punpckhwd mm5, mm4 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_14500)] - paddd mm4, MMWORD PTR[GLOBAL(_14500)] - paddd mm3, MMWORD PTR[GLOBAL(_7500)] - paddd mm5, MMWORD PTR[GLOBAL(_7500)] - - psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - - packssdw mm1, mm4 ; op[1] - packssdw mm3, mm5 ; op[3] - - ; done with vertical - ; transpose for the second stage - movq mm4, mm0 ; 00 10 20 30 - movq mm5, mm2 ; 02 12 22 32 - - punpcklwd mm0, mm1 ; 00 01 10 11 - punpckhwd mm4, mm1 ; 20 21 30 31 - - punpcklwd mm2, mm3 ; 02 03 12 13 - punpckhwd mm5, mm3 ; 22 23 32 33 - - movq mm1, mm0 ; 00 01 10 11 - punpckldq mm0, mm2 ; 00 01 02 03 - - punpckhdq mm1, mm2 ; 01 22 12 13 - - movq mm2, mm4 ; 20 31 30 31 - punpckldq mm2, mm5 ; 20 21 22 23 - - punpckhdq mm4, mm5 ; 30 31 32 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 4 - - movq mm5, mm0 - movq mm3, mm1 - - paddw mm0, mm4 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm3, mm2 ; c1 = 1 - 2 - psubw mm5, mm4 ; d1 = 0 - 3 - - pxor mm6, mm6 ; zero out for compare - - pcmpeqw mm6, mm5 ; d1 != 0 - - pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, - ; and keep bit 0 of lower - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; a1 + b1 - psubw mm2, mm1 ; a1 - b1 - - paddw mm0, MMWORD PTR[GLOBAL(_7w)] - paddw mm2, MMWORD PTR[GLOBAL(_7w)] - - psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4 - psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - - movq MMWORD PTR[rdi + 0 ], mm0 - movq 
MMWORD PTR[rdi + 16], mm2 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm3 ; c1 d1 - punpckhwd mm5, mm3 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_12000)] - paddd mm4, MMWORD PTR[GLOBAL(_12000)] - paddd mm3, MMWORD PTR[GLOBAL(_51000)] - paddd mm5, MMWORD PTR[GLOBAL(_51000)] - - psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - - packssdw mm1, mm4 ; op[4] - packssdw mm3, mm5 ; op[12] - - paddw mm1, mm6 ; op[4] += (d1!=0) - - movq MMWORD PTR[rdi + 8 ], mm1 - movq MMWORD PTR[rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 8 -_5352_2217: - dw 5352 - dw 2217 - dw 5352 - dw 2217 -align 8 -_2217_neg5352: - dw 2217 - dw -5352 - dw 2217 - dw -5352 -align 8 -_cmp_mask: - times 4 dw 1 -align 8 -_7w: - times 4 dw 7 -align 8 -_14500: - times 2 dd 14500 -align 8 -_7500: - times 2 dd 7500 -align 8 -_12000: - times 2 dd 12000 -align 8 -_51000: - times 2 dd 51000 diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.h b/libvpx/vp9/encoder/x86/vp9_dct_mmx.h deleted file mode 100644 index 3bac7c8..0000000 --- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
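The vp9_dct_mmx.asm file removed above carried the VP8-style 4x4 forward DCT, and its register comments spell out the arithmetic it performed. As a reading aid, here is a scalar sketch of that arithmetic (an illustration only, not the removed assembly nor its SSE2 replacement):

```c
/* Scalar sketch of the deleted MMX routine's arithmetic: a first pass scaled
 * by << 3 with >> 12 rounding, then a transposed pass with (x + 7) >> 4 and
 * >> 16 rounding plus the (d1 != 0) correction on op[4]. */
static void short_fdct4x4_scalar(const short *input, short *output, int pitch) {
  int i, a1, b1, c1, d1;
  const short *ip = input;
  short *op = output;

  for (i = 0; i < 4; i++) {                    /* vertical pass */
    a1 = (ip[0] + ip[3]) << 3;
    b1 = (ip[1] + ip[2]) << 3;
    c1 = (ip[1] - ip[2]) << 3;
    d1 = (ip[0] - ip[3]) << 3;

    op[0] = (short)(a1 + b1);
    op[2] = (short)(a1 - b1);
    op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

    ip += pitch / 2;                           /* pitch is given in bytes */
    op += 4;
  }

  for (i = 0, op = output; i < 4; i++, op++) { /* horizontal pass */
    a1 = op[0] + op[12];
    b1 = op[4] + op[8];
    c1 = op[4] - op[8];
    d1 = op[0] - op[12];

    op[0]  = (short)((a1 + b1 + 7) >> 4);
    op[8]  = (short)((a1 - b1 + 7) >> 4);
    op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
  }
}
```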
- */ - -#ifndef VP9_ENCODER_X86_VP9_DCT_MMX_H_ -#define VP9_ENCODER_X86_VP9_DCT_MMX_H_ - -extern void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch); - - -#endif /* VP9_ENCODER_X86_VP9_DCT_MMX_H_ */ diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index aaacebe..bf09c7a 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -10,6 +10,7 @@ #include <emmintrin.h> // SSE2 #include "vp9/common/vp9_idct.h" // for cospi constants +#include "vpx_ports/mem.h" void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty @@ -116,6 +117,166 @@ void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); } +static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i mask; + + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + + in[0] = _mm_slli_epi16(in[0], 4); + in[1] = _mm_slli_epi16(in[1], 4); + in[2] = _mm_slli_epi16(in[2], 4); + in[3] = _mm_slli_epi16(in[3], 4); + + mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); + in[0] = _mm_add_epi16(in[0], mask); + in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) { + const __m128i kOne = _mm_set1_epi16(1); + __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); + __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); + __m128i out01 = _mm_add_epi16(in01, kOne); + __m128i out23 = _mm_add_epi16(in23, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + _mm_store_si128((__m128i *)(output + 0 * 8), out01); + _mm_store_si128((__m128i *)(output + 1 * 8), out23); +} + +static INLINE void transpose_4x4(__m128i *res) { + // Combine and transpose + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + // 00 10 20 30 01 11 21 31 + // 02 12 22 32 03 13 23 33 + // only use the first 4 16-bit integers + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +void fdct4_1d_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u[4], v[4]; + u[0] = _mm_add_epi16(in[0], in[3]); + u[1] = _mm_add_epi16(in[1], in[2]); + u[2] = _mm_sub_epi16(in[1], in[2]); + u[3] = _mm_sub_epi16(in[0], in[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpacklo_epi16(u[2], u[3]); + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 + u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 + u[2] = 
_mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 + u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + transpose_4x4(in); +} + +void fadst4_1d_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); + const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); + const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); + const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + __m128i in7 = _mm_add_epi16(in[0], in[1]); + in7 = _mm_sub_epi16(in7, in[3]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[2], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[2], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = v[2]; + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + transpose_4x4(in); +} + +void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, + int stride, int tx_type) { + __m128i in[4]; + load_buffer_4x4(input, in, stride); + switch (tx_type) { + case 0: // DCT_DCT + fdct4_1d_sse2(in); + fdct4_1d_sse2(in); + break; + case 1: // ADST_DCT + fadst4_1d_sse2(in); + fdct4_1d_sse2(in); + break; + case 2: // DCT_ADST + fdct4_1d_sse2(in); + fadst4_1d_sse2(in); + break; + case 3: // ADST_ADST + fadst4_1d_sse2(in); + fadst4_1d_sse2(in); + break; + default: + assert(0); + break; + } + write_buffer_4x4(output, in); +} + void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; @@ -133,14 +294,14 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input - __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 
= _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); @@ -362,15 +523,543 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results - _mm_storeu_si128((__m128i *)(output + 0 * 8), in0); - _mm_storeu_si128((__m128i *)(output + 1 * 8), in1); - _mm_storeu_si128((__m128i *)(output + 2 * 8), in2); - _mm_storeu_si128((__m128i *)(output + 3 * 8), in3); - _mm_storeu_si128((__m128i *)(output + 4 * 8), in4); - _mm_storeu_si128((__m128i *)(output + 5 * 8), in5); - _mm_storeu_si128((__m128i *)(output + 6 * 8), in6); - _mm_storeu_si128((__m128i *)(output + 7 * 8), in7); + _mm_store_si128((__m128i *)(output + 0 * 8), in0); + _mm_store_si128((__m128i *)(output + 1 * 8), in1); + _mm_store_si128((__m128i *)(output + 2 * 8), in2); + _mm_store_si128((__m128i *)(output + 3 * 8), in3); + _mm_store_si128((__m128i *)(output + 4 * 8), in4); + _mm_store_si128((__m128i *)(output + 5 * 8), in5); + _mm_store_si128((__m128i *)(output + 6 * 8), in6); + _mm_store_si128((__m128i *)(output + 7 * 8), in7); + } +} + +// load 8x8 array +static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { + in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); + + in[0] = _mm_slli_epi16(in[0], 2); + in[1] = _mm_slli_epi16(in[1], 2); + in[2] = _mm_slli_epi16(in[2], 2); + in[3] = _mm_slli_epi16(in[3], 2); + in[4] = _mm_slli_epi16(in[4], 2); + in[5] = _mm_slli_epi16(in[5], 2); + in[6] = _mm_slli_epi16(in[6], 2); + in[7] = _mm_slli_epi16(in[7], 2); +} + +// right shift and rounding +static INLINE void right_shift_8x8(__m128i *res, int const bit) { + const __m128i kOne = _mm_set1_epi16(1); + const int bit_m02 = bit - 2; + __m128i sign0 = _mm_srai_epi16(res[0], 15); + __m128i sign1 = _mm_srai_epi16(res[1], 15); + __m128i sign2 = _mm_srai_epi16(res[2], 15); + __m128i sign3 = _mm_srai_epi16(res[3], 15); + __m128i sign4 = _mm_srai_epi16(res[4], 15); + __m128i sign5 = _mm_srai_epi16(res[5], 15); + __m128i sign6 = _mm_srai_epi16(res[6], 15); + __m128i sign7 = _mm_srai_epi16(res[7], 15); + + if (bit_m02 >= 0) { + 
__m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); + res[0] = _mm_add_epi16(res[0], k_const_rounding); + res[1] = _mm_add_epi16(res[1], k_const_rounding); + res[2] = _mm_add_epi16(res[2], k_const_rounding); + res[3] = _mm_add_epi16(res[3], k_const_rounding); + res[4] = _mm_add_epi16(res[4], k_const_rounding); + res[5] = _mm_add_epi16(res[5], k_const_rounding); + res[6] = _mm_add_epi16(res[6], k_const_rounding); + res[7] = _mm_add_epi16(res[7], k_const_rounding); + } + + res[0] = _mm_sub_epi16(res[0], sign0); + res[1] = _mm_sub_epi16(res[1], sign1); + res[2] = _mm_sub_epi16(res[2], sign2); + res[3] = _mm_sub_epi16(res[3], sign3); + res[4] = _mm_sub_epi16(res[4], sign4); + res[5] = _mm_sub_epi16(res[5], sign5); + res[6] = _mm_sub_epi16(res[6], sign6); + res[7] = _mm_sub_epi16(res[7], sign7); + + res[0] = _mm_srai_epi16(res[0], bit); + res[1] = _mm_srai_epi16(res[1], bit); + res[2] = _mm_srai_epi16(res[2], bit); + res[3] = _mm_srai_epi16(res[3], bit); + res[4] = _mm_srai_epi16(res[4], bit); + res[5] = _mm_srai_epi16(res[5], bit); + res[6] = _mm_srai_epi16(res[6], bit); + res[7] = _mm_srai_epi16(res[7], bit); +} + +// write 8x8 array +static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { + _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); + _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); + _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); + _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); + _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); + _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); + _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); + _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); +} + +// perform in-place transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = 
_mm_unpackhi_epi64(tr1_6, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 +} + +void fdct8_1d_sse2(__m128i *in) { + // constants + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 1 + s0 = _mm_add_epi16(in[0], in[7]); + s1 = _mm_add_epi16(in[1], in[6]); + s2 = _mm_add_epi16(in[2], in[5]); + s3 = _mm_add_epi16(in[3], in[4]); + s4 = _mm_sub_epi16(in[3], in[4]); + s5 = _mm_sub_epi16(in[2], in[5]); + s6 = _mm_sub_epi16(in[1], in[6]); + s7 = _mm_sub_epi16(in[0], in[7]); + + u0 = _mm_add_epi16(s0, s3); + u1 = _mm_add_epi16(s1, s2); + u2 = _mm_sub_epi16(s1, s2); + u3 = _mm_sub_epi16(s0, s3); + // interleave and perform butterfly multiplication/addition + v0 = _mm_unpacklo_epi16(u0, u1); + v1 = _mm_unpackhi_epi16(u0, u1); + v2 = _mm_unpacklo_epi16(u2, u3); + v3 = _mm_unpackhi_epi16(u2, u3); + + u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); + u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); + u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); + u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); + u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); + u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); + u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); + u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); + + // shift and rounding + v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u0, u1); + in[2] = _mm_packs_epi32(u4, u5); + in[4] = _mm_packs_epi32(u2, u3); + in[6] = _mm_packs_epi32(u6, u7); + + // stage 2 + // interleave and perform butterfly multiplication/addition + u0 = _mm_unpacklo_epi16(s6, s5); + u1 = _mm_unpackhi_epi16(s6, s5); + v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, 
k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + + u0 = _mm_packs_epi32(v0, v1); + u1 = _mm_packs_epi32(v2, v3); + + // stage 3 + s0 = _mm_add_epi16(s4, u0); + s1 = _mm_sub_epi16(s4, u0); + s2 = _mm_sub_epi16(s7, u1); + s3 = _mm_add_epi16(s7, u1); + + // stage 4 + u0 = _mm_unpacklo_epi16(s0, s3); + u1 = _mm_unpackhi_epi16(s0, s3); + u2 = _mm_unpacklo_epi16(s1, s2); + u3 = _mm_unpackhi_epi16(s1, s2); + + v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); + v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); + v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); + v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); + v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); + v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); + v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); + v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v0, v1); + in[3] = _mm_packs_epi32(v4, v5); + in[5] = _mm_packs_epi32(v2, v3); + in[7] = _mm_packs_epi32(v6, v7); + + // transpose + array_transpose_8x8(in, in); +} + +void fadst8_1d_sse2(__m128i *in) { + // Constants + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column 
transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = 
_mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + // FIXME(jingning): do subtract using bit inversion? 
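Every transform in this file leans on one idiom: interleave two 16-bit rows, multiply-accumulate against a pair of fixed-point cosine constants with _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS, and pack back to 16 bits. A scalar model of a single lane of that idiom, assuming the usual vp9_idct.h values (the header is included above for the cospi constants; DCT_CONST_BITS is taken to be 14):

```c
#include <stdint.h>

/* Assumed to match vp9_idct.h. */
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* What the add-rounding / _mm_srai_epi32 / _mm_packs_epi32 sequence does to
 * one 32-bit lane (ignoring the saturation that packs applies). */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)((input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

/* One butterfly output: a single lane of
 * _mm_madd_epi16(_mm_unpacklo_epi16(x, y), pair_set_epi16(c0, c1))
 * followed by the rounding shift. */
static int16_t butterfly(int16_t x, int16_t y, int16_t c0, int16_t c1) {
  return dct_const_round_shift((int32_t)x * c0 + (int32_t)y * c1);
}
```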
+ in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); + + // transpose + array_transpose_8x8(in, in); +} + +void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, + int stride, int tx_type) { + __m128i in[8]; + load_buffer_8x8(input, in, stride); + switch (tx_type) { + case 0: // DCT_DCT + fdct8_1d_sse2(in); + fdct8_1d_sse2(in); + break; + case 1: // ADST_DCT + fadst8_1d_sse2(in); + fdct8_1d_sse2(in); + break; + case 2: // DCT_ADST + fdct8_1d_sse2(in); + fadst8_1d_sse2(in); + break; + case 3: // ADST_ADST + fadst8_1d_sse2(in); + fadst8_1d_sse2(in); + break; + default: + assert(0); + break; } + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); } void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { @@ -383,7 +1072,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. - int16_t intermediate[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); int16_t *in = input; int16_t *out = intermediate; // Constants @@ -426,22 +1115,22 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. if (0 == pass) { - in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); @@ -460,22 +1149,22 @@ void 
vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { - in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); @@ -982,14 +1671,14 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results - _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); + _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); + _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); + _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); + _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); + _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); + _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); + _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); + _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } @@ -998,3 +1687,2109 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { out = output; } } + +static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, + __m128i *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // 
load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, + __m128i *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +void fdct16_1d_8col(__m128i *in) { + // perform 16x16 1-D DCT for 8 columns + __m128i i[8], s[8], p[8], t[8], u[16], v[16]; + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = _mm_add_epi16(in[0], in[15]); + i[1] = _mm_add_epi16(in[1], in[14]); + i[2] = _mm_add_epi16(in[2], in[13]); + i[3] = _mm_add_epi16(in[3], in[12]); + i[4] = _mm_add_epi16(in[4], in[11]); + i[5] = _mm_add_epi16(in[5], in[10]); + i[6] = _mm_add_epi16(in[6], in[9]); + i[7] = _mm_add_epi16(in[7], in[8]); + + s[0] = _mm_sub_epi16(in[7], in[8]); + s[1] = _mm_sub_epi16(in[6], in[9]); + s[2] = _mm_sub_epi16(in[5], in[10]); + s[3] = _mm_sub_epi16(in[4], in[11]); + s[4] = _mm_sub_epi16(in[3], in[12]); + s[5] = _mm_sub_epi16(in[2], in[13]); + s[6] = _mm_sub_epi16(in[1], in[14]); + s[7] = _mm_sub_epi16(in[0], in[15]); + + p[0] = _mm_add_epi16(i[0], i[7]); + p[1] = _mm_add_epi16(i[1], i[6]); + p[2] = _mm_add_epi16(i[2], i[5]); + p[3] = _mm_add_epi16(i[3], i[4]); + p[4] = _mm_sub_epi16(i[3], i[4]); + p[5] = 
_mm_sub_epi16(i[2], i[5]); + p[6] = _mm_sub_epi16(i[1], i[6]); + p[7] = _mm_sub_epi16(i[0], i[7]); + + u[0] = _mm_add_epi16(p[0], p[3]); + u[1] = _mm_add_epi16(p[1], p[2]); + u[2] = _mm_sub_epi16(p[1], p[2]); + u[3] = _mm_sub_epi16(p[0], p[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpackhi_epi16(u[0], u[1]); + v[2] = _mm_unpacklo_epi16(u[2], u[3]); + v[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); + u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); + u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); + u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); + u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); + u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); + u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); + u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[4] = _mm_packs_epi32(u[4], u[5]); + in[8] = _mm_packs_epi32(u[2], u[3]); + in[12] = _mm_packs_epi32(u[6], u[7]); + + u[0] = _mm_unpacklo_epi16(p[5], p[6]); + u[1] = _mm_unpackhi_epi16(p[5], p[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[2], v[3]); + + t[0] = _mm_add_epi16(p[4], u[0]); + t[1] = _mm_sub_epi16(p[4], u[0]); + t[2] = _mm_sub_epi16(p[7], u[1]); + t[3] = _mm_add_epi16(p[7], u[1]); + + u[0] = _mm_unpacklo_epi16(t[0], t[3]); + u[1] = _mm_unpackhi_epi16(t[0], t[3]); + u[2] = _mm_unpacklo_epi16(t[1], t[2]); + u[3] = _mm_unpackhi_epi16(t[1], t[2]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); + v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); + v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); + v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); + v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); + v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], 
k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + in[2] = _mm_packs_epi32(v[0], v[1]); + in[6] = _mm_packs_epi32(v[4], v[5]); + in[10] = _mm_packs_epi32(v[2], v[3]); + in[14] = _mm_packs_epi32(v[6], v[7]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[2], s[5]); + u[1] = _mm_unpackhi_epi16(s[2], s[5]); + u[2] = _mm_unpacklo_epi16(s[3], s[4]); + u[3] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[2] = _mm_packs_epi32(v[0], v[1]); + t[3] = _mm_packs_epi32(v[2], v[3]); + t[4] = _mm_packs_epi32(v[4], v[5]); + t[5] = _mm_packs_epi32(v[6], v[7]); + + // stage 3 + p[0] = _mm_add_epi16(s[0], t[3]); + p[1] = _mm_add_epi16(s[1], t[2]); + p[2] = _mm_sub_epi16(s[1], t[2]); + p[3] = _mm_sub_epi16(s[0], t[3]); + p[4] = _mm_sub_epi16(s[7], t[4]); + p[5] = _mm_sub_epi16(s[6], t[5]); + p[6] = _mm_add_epi16(s[6], t[5]); + p[7] = _mm_add_epi16(s[7], t[4]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(p[1], p[6]); + u[1] = _mm_unpackhi_epi16(p[1], p[6]); + u[2] = _mm_unpacklo_epi16(p[2], p[5]); + u[3] = _mm_unpackhi_epi16(p[2], p[5]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); + v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); + v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); + v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); + v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + 
v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[1] = _mm_packs_epi32(v[0], v[1]); + t[2] = _mm_packs_epi32(v[2], v[3]); + t[5] = _mm_packs_epi32(v[4], v[5]); + t[6] = _mm_packs_epi32(v[6], v[7]); + + // stage 5 + s[0] = _mm_add_epi16(p[0], t[1]); + s[1] = _mm_sub_epi16(p[0], t[1]); + s[2] = _mm_sub_epi16(p[3], t[2]); + s[3] = _mm_add_epi16(p[3], t[2]); + s[4] = _mm_add_epi16(p[4], t[5]); + s[5] = _mm_sub_epi16(p[4], t[5]); + s[6] = _mm_sub_epi16(p[7], t[6]); + s[7] = _mm_add_epi16(p[7], t[6]); + + // stage 6 + u[0] = _mm_unpacklo_epi16(s[0], s[7]); + u[1] = _mm_unpackhi_epi16(s[0], s[7]); + u[2] = _mm_unpacklo_epi16(s[1], s[6]); + u[3] = _mm_unpackhi_epi16(s[1], s[6]); + u[4] = _mm_unpacklo_epi16(s[2], s[5]); + u[5] = _mm_unpackhi_epi16(s[2], s[5]); + u[6] = _mm_unpacklo_epi16(s[3], s[4]); + u[7] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); + v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); + v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); + v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); + v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); + v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); + v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); + v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); + v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); + v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); + v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); + v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); + v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); + v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + 
in[1] = _mm_packs_epi32(v[0], v[1]); + in[9] = _mm_packs_epi32(v[2], v[3]); + in[5] = _mm_packs_epi32(v[4], v[5]); + in[13] = _mm_packs_epi32(v[6], v[7]); + in[3] = _mm_packs_epi32(v[8], v[9]); + in[11] = _mm_packs_epi32(v[10], v[11]); + in[7] = _mm_packs_epi32(v[12], v[13]); + in[15] = _mm_packs_epi32(v[14], v[15]); +} + +void fadst16_1d_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = 
_mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], 
k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + 
v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = 
_mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = 
_mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = 
_mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +void fdct16_1d_sse2(__m128i *in0, __m128i *in1) { + fdct16_1d_8col(in0); + fdct16_1d_8col(in1); + array_transpose_16x16(in0, in1); +} + +void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { + fadst16_1d_8col(in0); + fadst16_1d_8col(in1); + array_transpose_16x16(in0, in1); +} + +void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, + int stride, int tx_type) { + __m128i in0[16], in1[16]; + load_buffer_16x16(input, in0, in1, stride); + switch (tx_type) { + case 0: // DCT_DCT + fdct16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_1d_sse2(in0, in1); + break; + case 1: // ADST_DCT + fadst16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_1d_sse2(in0, in1); + break; + case 2: // DCT_ADST + fdct16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_1d_sse2(in0, in1); + break; + case 3: // ADST_ADST + fadst16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_1d_sse2(in0, in1); + break; + default: + assert(0); + break; + } + write_buffer_16x16(output, in0, in1, 16); +} + +void vp9_short_fdct32x32_rd_sse2(int16_t *input, + int16_t *output_org, int pitch) { + // Calculate pre-multiplied strides + const int str1 = pitch >> 1; + const int str2 = pitch; + const int str3 = pitch + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. 
That would help the + // instruction cache but is actually slower. + { + int16_t *ina = in + 0 * str1; + int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[ 0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 4 * str1; + int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[ 4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 8 * str1; + int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[ 8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 
0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 12 * str1; + int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
+ { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[ 0] = _mm_add_epi16(in00, in31); + step1[ 1] = _mm_add_epi16(in01, in30); + step1[ 2] = _mm_add_epi16(in02, in29); + step1[ 3] = _mm_add_epi16(in03, in28); + step1[28] = _mm_sub_epi16(in03, in28); + step1[29] = _mm_sub_epi16(in02, in29); + step1[30] = _mm_sub_epi16(in01, in30); + step1[31] = _mm_sub_epi16(in00, in31); + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[ 4] = _mm_add_epi16(in04, in27); + step1[ 5] = _mm_add_epi16(in05, in26); + step1[ 6] = _mm_add_epi16(in06, in25); + step1[ 7] = _mm_add_epi16(in07, in24); + step1[24] = _mm_sub_epi16(in07, in24); + step1[25] = _mm_sub_epi16(in06, in25); + step1[26] = _mm_sub_epi16(in05, in26); + step1[27] = _mm_sub_epi16(in04, in27); + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[ 8] = _mm_add_epi16(in08, in23); + step1[ 9] = _mm_add_epi16(in09, in22); + step1[10] = _mm_add_epi16(in10, in21); + step1[11] = _mm_add_epi16(in11, in20); + step1[20] = _mm_sub_epi16(in11, in20); + step1[21] = _mm_sub_epi16(in10, in21); + step1[22] = _mm_sub_epi16(in09, in22); + step1[23] = _mm_sub_epi16(in08, in23); + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = _mm_add_epi16(in12, in19); + step1[13] = _mm_add_epi16(in13, in18); + step1[14] = _mm_add_epi16(in14, in17); + step1[15] = _mm_add_epi16(in15, in16); + step1[16] = _mm_sub_epi16(in15, in16); + step1[17] = _mm_sub_epi16(in14, in17); + step1[18] = _mm_sub_epi16(in13, in18); + step1[19] = _mm_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[ 0] = _mm_add_epi16(step1[0], step1[15]); + 
step2[ 1] = _mm_add_epi16(step1[1], step1[14]); + step2[ 2] = _mm_add_epi16(step1[2], step1[13]); + step2[ 3] = _mm_add_epi16(step1[3], step1[12]); + step2[ 4] = _mm_add_epi16(step1[4], step1[11]); + step2[ 5] = _mm_add_epi16(step1[5], step1[10]); + step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); + step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); + step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); + step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); + step2[10] = _mm_sub_epi16(step1[5], step1[10]); + step2[11] = _mm_sub_epi16(step1[4], step1[11]); + step2[12] = _mm_sub_epi16(step1[3], step1[12]); + step2[13] = _mm_sub_epi16(step1[2], step1[13]); + step2[14] = _mm_sub_epi16(step1[1], step1[14]); + step2[15] = _mm_sub_epi16(step1[0], step1[15]); + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = 
_mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); + } + // Stage 3 + { + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm_add_epi16(step2[23], step1[16]); + step3[17] = _mm_add_epi16(step2[22], step1[17]); + step3[18] = _mm_add_epi16(step2[21], step1[18]); + step3[19] = _mm_add_epi16(step2[20], step1[19]); + step3[20] = _mm_sub_epi16(step1[19], step2[20]); + step3[21] = _mm_sub_epi16(step1[18], step2[21]); + step3[22] = _mm_sub_epi16(step1[17], step2[22]); + step3[23] = _mm_sub_epi16(step1[16], step2[23]); + step3[24] = _mm_sub_epi16(step1[31], step2[24]); + step3[25] = _mm_sub_epi16(step1[30], step2[25]); + step3[26] = _mm_sub_epi16(step1[29], step2[26]); + step3[27] = _mm_sub_epi16(step1[28], step2[27]); + step3[28] = _mm_add_epi16(step2[27], step1[28]); + step3[29] = _mm_add_epi16(step2[26], step1[29]); + step3[30] = _mm_add_epi16(step2[25], step1[30]); + step3[31] = _mm_add_epi16(step2[24], step1[31]); + } + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero); + step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0); + step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0); + step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0); + step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0); + step3[ 4] = _mm_sub_epi16(step3[ 4], 
s3_04_0); + step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0); + step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0); + step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0); + step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); + step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); + step3[10] = _mm_sub_epi16(step3[10], s3_10_0); + step3[11] = _mm_sub_epi16(step3[11], s3_11_0); + step3[12] = _mm_sub_epi16(step3[12], s3_12_0); + step3[13] = _mm_sub_epi16(step3[13], s3_13_0); + step2[14] = _mm_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm_sub_epi16(step2[15], s2_15_0); + step3[16] = _mm_sub_epi16(step3[16], s3_16_0); + step3[17] = _mm_sub_epi16(step3[17], s3_17_0); + step3[18] = _mm_sub_epi16(step3[18], s3_18_0); + step3[19] = _mm_sub_epi16(step3[19], s3_19_0); + step3[20] = _mm_sub_epi16(step3[20], s3_20_0); + step3[21] = _mm_sub_epi16(step3[21], s3_21_0); + step3[22] = _mm_sub_epi16(step3[22], s3_22_0); + step3[23] = _mm_sub_epi16(step3[23], s3_23_0); + step3[24] = _mm_sub_epi16(step3[24], s3_24_0); + step3[25] = _mm_sub_epi16(step3[25], s3_25_0); + step3[26] = _mm_sub_epi16(step3[26], s3_26_0); + step3[27] = _mm_sub_epi16(step3[27], s3_27_0); + step3[28] = _mm_sub_epi16(step3[28], s3_28_0); + step3[29] = _mm_sub_epi16(step3[29], s3_29_0); + step3[30] = _mm_sub_epi16(step3[30], s3_30_0); + step3[31] = _mm_sub_epi16(step3[31], s3_31_0); + step3[ 0] = _mm_add_epi16(step3[ 0], kOne); + step3[ 1] = _mm_add_epi16(step3[ 1], kOne); + step3[ 2] = _mm_add_epi16(step3[ 2], kOne); + step3[ 3] = _mm_add_epi16(step3[ 3], kOne); + step3[ 4] = _mm_add_epi16(step3[ 4], kOne); + step3[ 5] = _mm_add_epi16(step3[ 5], kOne); + step3[ 6] = _mm_add_epi16(step3[ 6], kOne); + step3[ 7] = _mm_add_epi16(step3[ 7], kOne); + step2[ 8] = _mm_add_epi16(step2[ 8], kOne); + step2[ 9] = _mm_add_epi16(step2[ 9], kOne); + step3[10] = _mm_add_epi16(step3[10], kOne); + step3[11] = _mm_add_epi16(step3[11], kOne); + step3[12] = _mm_add_epi16(step3[12], kOne); + step3[13] = _mm_add_epi16(step3[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step3[16] = _mm_add_epi16(step3[16], kOne); + step3[17] = _mm_add_epi16(step3[17], kOne); + step3[18] = _mm_add_epi16(step3[18], kOne); + step3[19] = _mm_add_epi16(step3[19], kOne); + step3[20] = _mm_add_epi16(step3[20], kOne); + step3[21] = _mm_add_epi16(step3[21], kOne); + step3[22] = _mm_add_epi16(step3[22], kOne); + step3[23] = _mm_add_epi16(step3[23], kOne); + step3[24] = _mm_add_epi16(step3[24], kOne); + step3[25] = _mm_add_epi16(step3[25], kOne); + step3[26] = _mm_add_epi16(step3[26], kOne); + step3[27] = _mm_add_epi16(step3[27], kOne); + step3[28] = _mm_add_epi16(step3[28], kOne); + step3[29] = _mm_add_epi16(step3[29], kOne); + step3[30] = _mm_add_epi16(step3[30], kOne); + step3[31] = _mm_add_epi16(step3[31], kOne); + step3[ 0] = _mm_srai_epi16(step3[ 0], 2); + step3[ 1] = _mm_srai_epi16(step3[ 1], 2); + step3[ 2] = _mm_srai_epi16(step3[ 2], 2); + step3[ 3] = _mm_srai_epi16(step3[ 3], 2); + step3[ 4] = _mm_srai_epi16(step3[ 4], 2); + step3[ 5] = _mm_srai_epi16(step3[ 5], 2); + step3[ 6] = _mm_srai_epi16(step3[ 6], 2); + step3[ 7] = _mm_srai_epi16(step3[ 7], 2); + step2[ 8] = _mm_srai_epi16(step2[ 8], 2); + step2[ 9] = _mm_srai_epi16(step2[ 9], 2); + step3[10] = _mm_srai_epi16(step3[10], 2); + step3[11] = _mm_srai_epi16(step3[11], 2); + step3[12] = _mm_srai_epi16(step3[12], 2); + step3[13] = _mm_srai_epi16(step3[13], 2); + step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step3[16] = _mm_srai_epi16(step3[16], 2); + 
step3[17] = _mm_srai_epi16(step3[17], 2); + step3[18] = _mm_srai_epi16(step3[18], 2); + step3[19] = _mm_srai_epi16(step3[19], 2); + step3[20] = _mm_srai_epi16(step3[20], 2); + step3[21] = _mm_srai_epi16(step3[21], 2); + step3[22] = _mm_srai_epi16(step3[22], 2); + step3[23] = _mm_srai_epi16(step3[23], 2); + step3[24] = _mm_srai_epi16(step3[24], 2); + step3[25] = _mm_srai_epi16(step3[25], 2); + step3[26] = _mm_srai_epi16(step3[26], 2); + step3[27] = _mm_srai_epi16(step3[27], 2); + step3[28] = _mm_srai_epi16(step3[28], 2); + step3[29] = _mm_srai_epi16(step3[29], 2); + step3[30] = _mm_srai_epi16(step3[30], 2); + step3[31] = _mm_srai_epi16(step3[31], 2); + } + // Stage 4 + { + step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); + step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); + step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); + step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); + step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); + step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); + step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); + step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); + step1[12] = _mm_sub_epi16(step2[15], step3[12]); + step1[13] = _mm_sub_epi16(step2[14], step3[13]); + step1[14] = _mm_add_epi16(step3[13], step2[14]); + step1[15] = _mm_add_epi16(step3[12], step2[15]); + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = 
_mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm_add_epi16(step1[5], step3[4]); + step2[5] = _mm_sub_epi16(step3[4], step1[5]); + step2[6] = _mm_sub_epi16(step3[7], step1[6]); + step2[7] = _mm_add_epi16(step1[6], step3[7]); + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + 
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 
= _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm_add_epi16(step1[19], step3[16]); + step2[17] = _mm_add_epi16(step1[18], step3[17]); + step2[18] = _mm_sub_epi16(step3[17], step1[18]); + step2[19] = _mm_sub_epi16(step3[16], step1[19]); + step2[20] = _mm_sub_epi16(step3[23], step1[20]); + step2[21] = _mm_sub_epi16(step3[22], step1[21]); + step2[22] = _mm_add_epi16(step1[21], step3[22]); + step2[23] = _mm_add_epi16(step1[20], step3[23]); + step2[24] = _mm_add_epi16(step1[27], step3[24]); + step2[25] = _mm_add_epi16(step1[26], step3[25]); + step2[26] = _mm_sub_epi16(step3[25], step1[26]); + step2[27] = _mm_sub_epi16(step3[24], step1[27]); + step2[28] = _mm_sub_epi16(step3[31], step1[28]); + step2[29] = _mm_sub_epi16(step3[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step3[30]); + step2[31] = _mm_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = 
_mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); + } + { + step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); + step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); + step3[10] = _mm_sub_epi16(step1[11], step2[10]); + step3[11] = _mm_add_epi16(step2[10], step1[11]); + step3[12] = _mm_add_epi16(step2[13], step1[12]); + step3[13] = _mm_sub_epi16(step1[12], step2[13]); + step3[14] = _mm_sub_epi16(step1[15], step2[14]); + step3[15] = _mm_add_epi16(step2[14], step1[15]); + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const 
__m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const 
__m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm_add_epi16(step3[17], step2[16]); + step1[17] = _mm_sub_epi16(step2[16], step3[17]); + step1[18] = _mm_sub_epi16(step2[19], step3[18]); + step1[19] = _mm_add_epi16(step3[18], step2[19]); + step1[20] = _mm_add_epi16(step3[21], step2[20]); + step1[21] = _mm_sub_epi16(step2[20], step3[21]); + step1[22] = _mm_sub_epi16(step2[23], step3[22]); + step1[23] = _mm_add_epi16(step3[22], step2[23]); + step1[24] = _mm_add_epi16(step3[25], step2[24]); + step1[25] = _mm_sub_epi16(step2[24], step3[25]); + step1[26] = _mm_sub_epi16(step2[27], step3[26]); + step1[27] = _mm_add_epi16(step3[26], step2[27]); + step1[28] = _mm_add_epi16(step3[29], step2[28]); + step1[29] = _mm_sub_epi16(step2[28], step3[29]); + step1[30] = _mm_sub_epi16(step2[31], step3[30]); + step1[31] = _mm_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. 
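Every rotation block in this function follows the same pattern: interleave two int16 vectors, multiply-accumulate against a pair of 14-bit fixed-point cosine constants, add the rounding constant, shift, and pack back to int16. A rough scalar equivalent of one such rotation (a sketch only; the constant names follow vp9_idct.h, and the SSE2 code performs eight of these per instruction) is:

    /* Scalar sketch of the madd / round / shift / pack pattern used above and
     * below (cf. dct_const_round_shift); illustrative, not the reference code. */
    #include <stdint.h>
    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
    static int16_t rotate_round(int16_t a, int16_t b, int c0, int c1) {
      const int32_t t = a * c0 + b * c1;              /* _mm_madd_epi16          */
      return (int16_t)((t + DCT_CONST_ROUNDING)       /* k__DCT_CONST_ROUNDING   */
                       >> DCT_CONST_BITS);            /* _mm_srai_epi32 + packs  */
    }
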
+ { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const 
__m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = _mm_add_epi32(out_11_2, 
k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } + // Transpose the results, do it as four 8x8 transposes. 
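The transpose that follows writes each 8x8 tile of the 32 result rows so that the second pass can read what were columns as contiguous rows. In scalar terms the effect is roughly the following, where tile[r][c] is a hypothetical element-wise view of the eight __m128i rows of one block and 32 is the row stride of the intermediate buffer:

    /* Scalar model of one 8x8 transpose below (sketch; tile[r][c] stands for
     * element c of the r-th __m128i row of the current block). */
    #include <stdint.h>
    static void transpose_8x8_tile(const int16_t tile[8][8], int16_t *output) {
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c)
          output[c * 32 + r] = tile[r][c];
    }
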
+ { + int transpose_block; + int16_t *output; + if (0 == pass) { + output = &intermediate[column_start * 32]; + } else { + output = &output_org[column_start * 32]; + } + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... 
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in vp9/encoder/vp9_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); + // Process next 8x8 + output += 8; + } + } + } + } +} diff --git a/libvpx/vp9/encoder/x86/vp9_encodeopt.asm b/libvpx/vp9/encoder/x86/vp9_encodeopt.asm deleted file mode 100644 index 734cb61..0000000 --- a/libvpx/vp9/encoder/x86/vp9_encodeopt.asm +++ /dev/null @@ -1,125 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_xmm) PRIVATE -sym(vp9_block_error_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prologue - - mov rsi, arg(0) ;coeff_ptr - mov rdi, arg(1) ;dcoef_ptr - - movdqa xmm0, [rsi] - movdqa xmm1, [rdi] - - movdqa xmm2, [rsi+16] - movdqa xmm3, [rdi+16] - - psubw xmm0, xmm1 - psubw xmm2, xmm3 - - pmaddwd xmm0, xmm0 - pmaddwd xmm2, xmm2 - - paddd xmm0, xmm2 - - pxor xmm5, xmm5 - movdqa xmm1, xmm0 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - psrldq xmm0, 8 - paddd xmm0, xmm1 - - movq rax, xmm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - -;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_mmx) PRIVATE -sym(vp9_block_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov rdi, arg(1) ;dcoef_ptr - movq mm3, [rsi] - - movq mm4, [rdi] - movq mm5, [rsi+8] - - movq mm6, [rdi+8] - pxor mm1, mm1 ; from movd mm1, dc ; dc =0 - - movq mm2, mm7 - psubw mm5, mm6 - - por mm1, mm2 - pmaddwd mm5, mm5 - - pcmpeqw mm1, mm7 - psubw mm3, mm4 - - pand mm1, mm3 - pmaddwd mm1, mm1 - - paddd mm1, mm5 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm3, mm5 - - paddd mm1, mm3 - movq mm0, mm1 - - psrlq mm1, 32 - paddd mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm new file mode 100644 index 0000000..1126fdb --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -0,0 +1,74 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register + lea uqcq, [uqcq+sizeq*2] + lea dqcq, [dqcq+sizeq*2] + neg sizeq +.loop: + mova m2, [uqcq+sizeq*2] + mova m0, [dqcq+sizeq*2] + mova m3, [uqcq+sizeq*2+mmsize] + mova m1, [dqcq+sizeq*2+mmsize] + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 
15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m1, m5 + paddq m4, m0 + punpckhdq m1, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m1 + punpckhdq m2, m5 + paddq m6, m7 + punpckldq m7, m3, m5 + paddq m6, m2 + punpckhdq m3, m5 + paddq m6, m7 + paddq m6, m3 + add sizeq, mmsize + jl .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm b/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm deleted file mode 100644 index 7bee9ef..0000000 --- a/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm +++ /dev/null @@ -1,164 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch) -global sym(vp9_short_walsh4x4_sse2) PRIVATE -sym(vp9_short_walsh4x4_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ; input - mov rdi, arg(1) ; output - movsxd rdx, dword ptr arg(2) ; pitch - - ; first for loop - movq xmm0, MMWORD PTR [rsi] ; load input - movq xmm1, MMWORD PTR [rsi + rdx] - lea rsi, [rsi + rdx*2] - movq xmm2, MMWORD PTR [rsi] - movq xmm3, MMWORD PTR [rsi + rdx] - - punpcklwd xmm0, xmm1 - punpcklwd xmm2, xmm3 - - movdqa xmm1, xmm0 - punpckldq xmm0, xmm2 ; ip[1] ip[0] - punpckhdq xmm1, xmm2 ; ip[3] ip[2] - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - - psllw xmm0, 2 ; d1 a1 - psllw xmm2, 2 ; c1 b1 - - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 ; b1 a1 - punpckhqdq xmm1, xmm2 ; c1 d1 - - pxor xmm6, xmm6 - movq xmm6, xmm0 - pxor xmm7, xmm7 - pcmpeqw xmm7, xmm6 - paddw xmm7, [GLOBAL(c1)] - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 ; b1+c1 a1+d1 - psubw xmm2, xmm1 ; b1-c1 a1-d1 - paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0) - - ; second for loop - ; input: 13 9 5 1 12 8 4 0 (xmm0) - ; 14 10 6 2 15 11 7 3 (xmm2) - ; after shuffle: - ; 13 5 9 1 12 4 8 0 (xmm0) - ; 14 6 10 2 15 7 11 3 (xmm1) - pshuflw xmm3, xmm0, 0xd8 - pshufhw xmm0, xmm3, 0xd8 - pshuflw xmm3, xmm2, 0xd8 - pshufhw xmm1, xmm3, 0xd8 - - movdqa xmm2, xmm0 - pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10 - pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10 - movdqa xmm3, xmm1 - pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13 - pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13 - - pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10 - pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10 - pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12 - pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12 - - movdqa xmm0, xmm4 - punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10 - punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10 - movdqa xmm1, xmm6 - punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12 - punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12 - - movdqa xmm2, xmm0 - paddd xmm0, xmm4 ; b21 b20 a21 a20 - 
psubd xmm2, xmm4 ; c21 c20 d21 d20 - movdqa xmm3, xmm1 - paddd xmm1, xmm6 ; b23 b22 a23 a22 - psubd xmm3, xmm6 ; c23 c22 d23 d22 - - pxor xmm4, xmm4 - movdqa xmm5, xmm4 - pcmpgtd xmm4, xmm0 - pcmpgtd xmm5, xmm2 - pand xmm4, [GLOBAL(cd1)] - pand xmm5, [GLOBAL(cd1)] - - pxor xmm6, xmm6 - movdqa xmm7, xmm6 - pcmpgtd xmm6, xmm1 - pcmpgtd xmm7, xmm3 - pand xmm6, [GLOBAL(cd1)] - pand xmm7, [GLOBAL(cd1)] - - paddd xmm0, xmm4 - paddd xmm2, xmm5 - paddd xmm0, [GLOBAL(cd3)] - paddd xmm2, [GLOBAL(cd3)] - paddd xmm1, xmm6 - paddd xmm3, xmm7 - paddd xmm1, [GLOBAL(cd3)] - paddd xmm3, [GLOBAL(cd3)] - - psrad xmm0, 3 - psrad xmm1, 3 - psrad xmm2, 3 - psrad xmm3, 3 - movdqa xmm4, xmm0 - punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20 - punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20 - movdqa xmm5, xmm2 - punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20 - punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20 - - packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20 - packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20 - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm2 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -c1: - dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 -align 16 -cn1: - dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff -align 16 -cd1: - dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 -align 16 -cd3: - dd 0x00000003, 0x00000003, 0x00000003, 0x00000003 diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm new file mode 100644 index 0000000..60f7991 --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -0,0 +1,214 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. 
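The loop that follows makes the same per-coefficient decision as the C reference quantizer: compare |coeff| against the zbin + zbin_oq dead zone, add the rounding term, multiply by quant and quant_shift, restore the sign, dequantize, and track the end-of-block position from iscan. A rough scalar model of one iteration (a sketch patterned on vp9_quantize_b; the 32x32 halving and the saturation details are omitted):

    /* Rough scalar model of one iteration of the SSSE3 quantize loop below
     * (illustrative only; not the exact reference implementation). */
    #include <stdint.h>
    #include <stdlib.h>
    static void quantize_coeff(int i, const int16_t *coeff, const int16_t *zbin,
                               const int16_t *round, const int16_t *quant,
                               const int16_t *quant_shift,
                               const int16_t *dequant, int zbin_oq,
                               const int16_t *iscan, int16_t *qcoeff,
                               int16_t *dqcoeff, int *eob) {
      const int rc = (i != 0);              /* DC uses entry 0, AC uses entry 1 */
      const int abs_c = abs(coeff[i]);
      qcoeff[i] = dqcoeff[i] = 0;
      if (abs_c >= zbin[rc] + zbin_oq) {
        int tmp = abs_c + round[rc];
        tmp = ((((tmp * quant[rc]) >> 16) + tmp) * quant_shift[rc]) >> 16;
        qcoeff[i]  = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
        dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[rc]);
        if (tmp && iscan[i] + 1 > *eob)
          *eob = iscan[i] + 1;              /* eob = max(scan position + 1) */
      }
    }
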
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movd m4, dword zbin_oqm ; m4 = zbin_oq + mova m0, [zbinq] ; m0 = zbin + punpcklwd m4, m4 + mova m1, [roundq] ; m1 = round + pshufd m4, m4, 0 + mova m2, [quantq] ; m2 = quant + paddw m0, m4 ; m0 = zbin + zbin_oq + mova m3, [r2q] ; m3 = dequant + psubw m0, [pw_1] + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob + lea coeffq, [ coeffq+ncoeffq*2] + lea iscanq, [ iscanq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, b_32x32 + paddw m6, m6 + paddw m11, m11 +%endif + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, b_32x32 + paddw m6, m6 + paddw m11, m11 +%endif + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6, m7 + pmovmskb r2, m12 + or r6, r2 + jz .skip_iter +%endif + paddw m6, m1 ; m6 += round + paddw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw 
m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw [r2], m8, 0 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [dqcoeffq+ncoeffq*2+ 0], m7 + mova [dqcoeffq+ncoeffq*2+16], m7 + mova [qcoeffq+ncoeffq*2+ 0], m7 + mova [qcoeffq+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [eobq], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 6 +QUANTIZE_FN b_32x32, 7 diff --git a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm b/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm index 8fb7d41..c4c5c54 100644 --- a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm @@ -12,12 +12,42 @@ SECTION .text -; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD64XN 1 -cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: @@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] @@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows INIT_XMM sse2 SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, 
int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD32XN 1 -cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 mov n_rowsd, %1/2 pxor m0, m0 - .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+src_strideq] @@ -85,16 +128,14 @@ INIT_XMM sse2 SAD32XN 64 ; sad32x64_sse2 SAD32XN 32 ; sad32x32_sse2 SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD16XN 1 -cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 @@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] @@ -126,16 +174,14 @@ INIT_XMM sse2 SAD16XN 32 ; sad16x32_sse2 SAD16XN 16 ; sad16x16_sse2 SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 ; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD8XN 1 -cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 @@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ movhps m1, [refq+ref_strideq] movh m2, [refq+ref_strideq*2] movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif movh m3, [srcq] movhps m3, [srcq+src_strideq] movh m4, [srcq+src_strideq*2] @@ -167,16 +218,14 @@ INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 ; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD4XN 1 -cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 @@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, 
ref_stride, \ movd m4, [refq+ref_stride3q] punpckldq m1, m2 punpckldq m3, m4 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m3, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] @@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \ INIT_MMX sse SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm new file mode 100644 index 0000000..19e2feb --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -0,0 +1,1288 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 15 + times 8 dw 1 + times 8 dw 14 + times 8 dw 2 + times 8 dw 13 + times 8 dw 3 + times 8 dw 12 + times 8 dw 4 + times 8 dw 11 + times 8 dw 5 + times 8 dw 10 + times 8 dw 6 + times 8 dw 9 + times 8 dw 7 + times 16 dw 8 + times 8 dw 7 + times 8 dw 9 + times 8 dw 6 + times 8 dw 10 + times 8 dw 5 + times 8 dw 11 + times 8 dw 4 + times 8 dw 12 + times 8 dw 3 + times 8 dw 13 + times 8 dw 2 + times 8 dw 14 + times 8 dw 1 + times 8 dw 15 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 15, 1 + times 8 db 14, 2 + times 8 db 13, 3 + times 8 db 12, 4 + times 8 db 11, 5 + times 8 db 10, 6 + times 8 db 9, 7 + times 16 db 8 + times 8 db 7, 9 + times 8 db 6, 10 + times 8 db 5, 11 + times 8 db 4, 12 + times 8 db 3, 13 + times 8 db 2, 14 + times 8 db 1, 15 + +SECTION .text + +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. 
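For reference, a plain-C sketch of what the SAD_FN/SADWxN macros in vp9_sad_sse2.asm above compute, including the new _avg variants that pavgb the reference block against a second predictor before summing absolute differences. This is not code from the patch; it assumes the second predictor is a contiguous WxH block (stride W), which is consistent with how second_predq is advanced in the assembly.

#include <stdint.h>
#include <stdlib.h>

/* Reference model of vp9_sadWxH_sse2 / vp9_sadWxH_avg_sse2.
 * Pass second_pred == NULL for the plain SAD. */
static unsigned int sad_wxh_ref(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                const uint8_t *second_pred,
                                int w, int h) {
  unsigned int sad = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int p = ref[c];
      if (second_pred != NULL)
        p = (p + second_pred[c] + 1) >> 1;  /* pavgb: rounded byte average */
      sad += abs(src[c] - p);
    }
    src += src_stride;
    ref += ref_stride;
    if (second_pred != NULL)
      second_pred += w;                     /* packed WxH second predictor */
  }
  return sad;
}

The packed layout of the second predictor is why the assembly can simply step second_predq forward by the number of bytes consumed in each loop iteration, independent of ref_stride.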
+ pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd rax, m6 ; store sum as return value +%else ; mmsize == 8 + pshufw m4, m6, 0xe + pshufw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshufw m4, m6, 0xe + paddd m6, m4 + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 +%ifdef PIC +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse +%define sec_str sec_strideq +%else +cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%endif +%define h heightd +%define bilin_filter sseq +%else +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse +%if ARCH_X86_64 +%define h heightd +%define sec_str sec_strideq +%else +%define h dword heightm +%define sec_str sec_stridemp +%endif +%else +cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%define h heightd +%endif +%define bilin_filter bilin_filter_m +%endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar h, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? 
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] +%endif +%else ; !avg + movh m2, [srcq+src_strideq] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq*2] +%else ; mmsize == 8 + punpckldq m2, [srcq+src_strideq*2] +%endif + movh m1, [dstq] +%if mmsize == 16 + movlhps m0, m2 +%else ; mmsize == 8 + punpckldq m0, m2 +%endif + movh m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m4, [srcq+src_strideq*2] + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do 
out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m4, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] + punpckldq m4, [srcq+src_strideq+1] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m2, [srcq+src_strideq] + movh m1, [dstq] + pavgb m0, m4 + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, 
m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movh m2, [srcq] + movh m3, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + punpckldq m2, [srcq+src_strideq] + punpckldq m3, [srcq+src_strideq+1] +%endif + pavgb m2, m3 +%if mmsize == 16 + movlhps m0, m2 + movhlps m4, m2 +%else ; mmsize == 8 + punpckldq m0, m2 + pshufw m4, m2, 0xe +%endif + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m4, [srcq+src_strideq] + movh m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + 
pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movh m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movh m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 
+%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movh m1, [dstq] + paddw m4, m3 + movh m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl 
x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + add srcq, src_strideq +.x_other_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m3, [dstq+dst_strideq] + movh m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + 
packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movh m3, [dstq+dst_strideq] + paddw m2, m1 + movh m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_MMX sse +SUBPEL_VARIANCE 4 +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX sse +SUBPEL_VARIANCE 4, 1 +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4, 1 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm index 8a2a471..2ecc23e 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm @@ -8,292 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
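The SUBPEL_VARIANCE macro above implements vp9_sub_pixel_varianceWxH (and its _avg form) for W = 4, 8 and 16: it bilinearly interpolates src at a fractional offset, optionally averages the prediction with a second predictor, and accumulates the sum of differences (the return value) and the sum of squared differences (*sse) against dst. The vp9_filter_block2d_bil_var_sse2 routine removed just below performed the same kind of two-tap filtering, but with 7-bit coefficients and a shift of 7. A plain-C sketch of the new computation follows; it is a reading aid, not a bit-exact drop-in from the patch, and the names are illustrative.

#include <stddef.h>
#include <stdint.h>

/* Offsets are in the 0..15 range; taps are (16 - o, o) with rounding 8 and
 * shift 4, matching the bilin_filter_m tables.  The horizontal result is
 * rounded back to 8 bits before the vertical tap, as in the assembly.
 * Offsets 0 and 8 degenerate to the copy and pavgb fast paths.  For
 * simplicity this always reads the (W+1)x(H+1) neighbourhood; the
 * assembly's fast paths avoid the extra reads when an offset is 0. */
static int subpel_variance_ref(const uint8_t *src, ptrdiff_t src_stride,
                               int x_offset, int y_offset,
                               const uint8_t *dst, ptrdiff_t dst_stride,
                               int w, int h,
                               const uint8_t *sec, ptrdiff_t sec_stride,
                               unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const uint8_t *s = src + r * src_stride + c;
      const int a = ((16 - x_offset) * s[0] + x_offset * s[1] + 8) >> 4;
      const int b = ((16 - x_offset) * s[src_stride] +
                     x_offset * s[src_stride + 1] + 8) >> 4;
      int p = ((16 - y_offset) * a + y_offset * b + 8) >> 4;
      if (sec != NULL)                      /* _avg variant: pavgb with sec */
        p = (p + sec[r * sec_stride + c] + 1) >> 1;
      const int d = p - dst[r * dst_stride + c];
      sum += d;
      sq += (unsigned int)(d * d);
    }
  }
  *sse = sq;
  return sum;  /* the SE; callers combine it with *sse into a variance */
}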
; - %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - -;void vp9_filter_block2d_bil_var_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE -sym(vp9_filter_block2d_bil_var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - push rbx - ; end prolog - - pxor xmm6, xmm6 ; - pxor xmm7, xmm7 ; - - lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding - movdqa xmm4, XMMWORD PTR [rsi] - - lea rcx, [GLOBAL(bilinear_filters_sse2)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_sse2_sp_only - - shl rax, 5 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_sse2_fp_only - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - movdqa xmm5, xmm1 - - movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line - lea rsi, [rsi + rbx] -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -filter_block2d_bil_var_sse2_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - - pmullw xmm3, [rdx] ; - pmullw xmm1, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rbx] ;ref_pixels_per_line -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 ; - jnz filter_block2d_bil_var_sse2_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 - je filter_block2d_bil_var_sse2_full_pixel - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + rax] - -filter_block2d_bil_sp_only_loop: - movq xmm3, QWORD PTR [rsi] ; - punpcklbw xmm3, xmm0 ; - movdqa xmm5, xmm3 - - pmullw xmm1, [rdx] ; - pmullw xmm3, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - movdqa xmm1, xmm5 ; - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz 
filter_block2d_bil_sp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 ; - -filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movq xmm2, QWORD PTR [rdi] ; - punpcklbw xmm2, xmm0 ; - - psubw xmm1, xmm2 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_full_pixel_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - -filter_block2d_bil_fp_only_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - lea rsi, [rsi + rdx] - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_fp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_variance: - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(7) ; sum - mov rdi, arg(8) ; sumsquared - - movd [rsi], mm2 ; xsum - movd [rdi], mm4 ; xxsum - - ; begin epilog - pop rbx - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - ;void vp9_half_horiz_vert_variance16x_h_sse2 ;( ; unsigned char *ref_ptr, @@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2): UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 
16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm b/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm deleted file mode 100644 index e9eda4f..0000000 --- a/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm +++ /dev/null @@ -1,432 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_mmx_impl) PRIVATE -sym(vp9_subtract_b_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi], mm0 - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2],mm0 - - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_mmx) PRIVATE -sym(vp9_subtract_mby_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 16 - pxor mm0, mm0 - -.submby_loop: - - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi], mm1 - movq [rdi+8], mm2 - - - movq mm1, [rsi+8] - movq mm3, [rax+8] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi+16], mm1 - movq [rdi+24], mm2 - - - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_mmx) PRIVATE -sym(vp9_subtract_mbuv_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, 
arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, 
mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm index 739d948..9824080 100644 --- a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm @@ -8,349 +8,120 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_sse2_impl) PRIVATE -sym(vp9_subtract_b_sse2_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi], mm0 - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_sse2) PRIVATE -sym(vp9_subtract_mby_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time - -.submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] - - movdqa xmm6, xmm4 - psubb xmm4, xmm5 - - 
pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information - - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_sse2) PRIVATE -sym(vp9_subtract_mbuv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - 
movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -t80: - times 16 db 0x80 +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void vp9_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 
1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + RET diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm index 9f140c9..d3dbefe 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm @@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx): UNSHADOW_ARGS pop rbp ret - -%define mmx_filter_shift 7 - -;void vp9_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - 
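The width-dispatched subtract kernel added above (vp9_subtract_block, SSE2) replaces the per-block vp9_subtract_b/mby/mbuv routines deleted in this change. Its effect is a plain element-wise residual; a C sketch matching the prototype quoted in the file follows (illustrative only, not the patch's own C counterpart).

#include <stddef.h>
#include <stdint.h>

/* diff[r][c] = src[r][c] - pred[r][c], with diff_stride counted in int16_t
 * elements (the assembly scales it by 2 bytes when forming addresses). */
static void subtract_block_ref(int rows, int cols,
                               int16_t *diff, ptrdiff_t diff_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               const uint8_t *pred, ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c)
      diff[c] = (int16_t)(src[c] - pred[c]);
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}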
movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - - -;void vp9_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 ; - - paddw mm2, mm4 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - 
- psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm index 896dd18..2c50881 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm @@ -11,8 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - ;unsigned int vp9_get_mb_ss_sse2 ;( ; short *src_ptr @@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2): UNSHADOW_ARGS pop rbp ret - - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm deleted file mode 100644 index 98a4a16..0000000 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - - -;void vp9_filter_block2d_bil_var_ssse3 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -;Note: The filter coefficient at offset=0 is 128. Since the second register -;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. 
-global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE -sym(vp9_filter_block2d_bil_var_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .filter_block2d_bil_var_ssse3_sp_only - - shl rax, 4 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je .filter_block2d_bil_var_ssse3_fp_only - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi+1] - movdqa xmm2, xmm0 - - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, [rax] - pmaddubsw xmm2, [rax] - - paddw xmm0, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm0, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - packuswb xmm0, xmm2 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + r8] -%endif - -.filter_block2d_bil_var_ssse3_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - packuswb xmm1, xmm3 - - movdqa xmm2, xmm0 - movdqa xmm0, xmm1 - movdqa xmm3, xmm2 - - punpcklbw xmm2, xmm1 - punpckhbw xmm3, xmm1 - pmaddubsw xmm2, [rdx] - pmaddubsw xmm3, [rdx] - - paddw xmm2, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm2, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm1, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm1, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm2, xmm1 - psubw xmm3, xmm5 - paddw xmm6, xmm2 - paddw xmm6, xmm3 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm2 - paddd xmm7, xmm3 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rsi, [rsi + r8] - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_var_ssse3_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je .filter_block2d_bil_var_ssse3_full_pixel - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - movdqu xmm1, XMMWORD PTR [rsi] - movdqa xmm0, xmm1 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - lea rsi, [rsi + rax] - -.filter_block2d_bil_sp_only_loop: - movdqu xmm3, XMMWORD PTR [rsi] - movdqa xmm2, xmm1 - movdqa xmm0, xmm3 - - punpcklbw xmm1, xmm3 - punpckhbw xmm2, xmm3 - pmaddubsw xmm1, [rdx] - pmaddubsw xmm2, [rdx] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - movq xmm3, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm3, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw 
xmm5, xmm4 - - psubw xmm1, xmm3 - psubw xmm2, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - movdqa xmm1, xmm0 - lea rsi, [rsi + rax] ;ref_pixels_per_line - -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_sp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 - -.filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] - punpcklbw xmm1, xmm0 - movq xmm2, QWORD PTR [rsi+8] - punpcklbw xmm2, xmm0 - - movq xmm3, QWORD PTR [rdi] - punpcklbw xmm3, xmm0 - movq xmm4, QWORD PTR [rdi+8] - punpcklbw xmm4, xmm0 - - psubw xmm1, xmm3 - psubw xmm2, xmm4 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rdx] ;src_pixels_per_line - sub rcx, 1 - jnz .filter_block2d_bil_full_pixel_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -.filter_block2d_bil_fp_only_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm2, XMMWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm2, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm2 - psubw xmm3, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm3 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm1 - paddd xmm7, xmm3 - - lea rsi, [rsi + rdx] -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_fp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_variance: - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(7) ;[Sum] - mov rdi, arg(8) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 diff --git 
a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c index bad1cfa..d141560 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c @@ -13,27 +13,6 @@ #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern void filter_block1d_h6_mmx -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); -extern void filter_block1d_v6_mmx -( - const short *src_ptr, - unsigned char *output_ptr, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); - extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr); extern unsigned int vp9_get8x8var_mmx ( @@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx unsigned int *SSE, int *Sum ); -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - unsigned int vp9_variance4x4_mmx( const unsigned char *src_ptr, @@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx( return (var - (((unsigned int)avg * avg) >> 7)); } - -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - -unsigned int vp9_sub_pixel_variance4x4_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) - -{ - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp9_sub_pixel_variance8x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp9_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - 
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); - - -} - -unsigned int vp9_sub_pixel_mse16x16_mmx( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp9_variance_halfpixvar16x16_h_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_v_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_hv_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index 67ca925..b4ff850 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -9,29 +9,11 @@ */ #include "vpx_config.h" + #include "vp9/encoder/vp9_variance.h" #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -#define HALFNDX 8 - -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int 
src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); - -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - extern unsigned int vp9_get4x4var_mmx ( const unsigned char *src_ptr, @@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2 unsigned int *SSE, int *Sum ); -void vp9_filter_block2d_bil_var_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); void vp9_half_horiz_vert_variance8x_h_sse2 ( const unsigned char *ref_ptr, @@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - typedef unsigned int (*get_var_sse2) ( const unsigned char *src_ptr, int source_stride, @@ -375,347 +343,162 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, return (var - (((int64_t)avg * avg) >> 11)); } -unsigned int vp9_sub_pixel_variance4x4_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +#define DECL(w, opt) \ +int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } - -unsigned int vp9_sub_pixel_variance8x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - 
const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, int *avg) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum1, &xxsum1 - ); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - *avg = xsum0; -} - -unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg; - unsigned int sse; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse, &avg); - *sse_ptr = sse; - - return (sse - (((unsigned int) avg * avg) >> 8)); -} - -unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3; - unsigned int sse0, sse1, sse2, sse3; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse3, &avg3); - sse0 += sse1 + sse2 + sse3; - avg0 += 
avg1 + avg2 + avg3; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 10)); -} - -unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3, avg4; - unsigned int sse0, sse1, sse2, sse3, sse4; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse3, &avg3); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3; - sse0 += sse1 + sse2 + sse3; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 12)); -} - -unsigned int vp9_sub_pixel_mse16x16_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int 
vp9_sub_pixel_variance16x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum1, &xxsum1); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + +#define DECL(w, opt) \ +int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + const uint8_t *sec, \ + ptrdiff_t sec_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = 
vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN unsigned int vp9_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, diff --git a/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c b/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c deleted file mode 100644 index 882acad..0000000 --- a/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" -#include "vpx_ports/mem.h" - -#define HALFNDX 8 - -extern void vp9_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_ssse3 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp9_sub_pixel_variance16x16_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0; - unsigned int xxsum0; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - -unsigned int vp9_sub_pixel_variance16x8_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0; - unsigned int xxsum0; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} diff --git a/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c b/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c deleted file mode 100644 index 6016e14..0000000 --- a/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ /dev/null @@ -1,55 
+0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "vpx_ports/x86.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/x86/vp9_dct_mmx.h" - -// TODO(jimbankoski) Consider rewriting the c to take the same values rather -// than going through these pointer conversions -#if 0 && HAVE_MMX -void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp9_short_fdct4x4_mmx(input, output, pitch); - vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); -} - -void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. - vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); -} - -#endif - -#if 0 && HAVE_SSE2 -void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. 
- vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); -} - -#endif diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index 7a74833..5a0c1c9 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -14,7 +14,6 @@ VP9_COMMON_SRCS-yes += common/vp9_pragmas.h VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_onyx.h VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c -VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c VP9_COMMON_SRCS-yes += common/vp9_convolve.c VP9_COMMON_SRCS-yes += common/vp9_convolve.h VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c @@ -39,7 +38,6 @@ VP9_COMMON_SRCS-yes += common/vp9_extend.h VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h -VP9_COMMON_SRCS-yes += common/vp9_modecont.h VP9_COMMON_SRCS-yes += common/vp9_mv.h VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h VP9_COMMON_SRCS-yes += common/vp9_pred_common.h @@ -60,9 +58,6 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_treecoder.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c -VP9_COMMON_SRCS-yes += common/vp9_mbpitch.c -VP9_COMMON_SRCS-yes += common/vp9_modecont.c -VP9_COMMON_SRCS-yes += common/vp9_modecontext.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h VP9_COMMON_SRCS-yes += common/vp9_quant_common.c @@ -70,37 +65,31 @@ VP9_COMMON_SRCS-yes += common/vp9_reconinter.c VP9_COMMON_SRCS-yes += common/vp9_reconintra.c VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c VP9_COMMON_SRCS-yes += common/vp9_treecoder.c -VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c +VP9_COMMON_SRCS-yes += common/vp9_common_data.c +VP9_COMMON_SRCS-yes += common/vp9_common_data.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif -# common (c) -ifeq ($(CONFIG_CSM),yes) -VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c -VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm -endif - VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c -$(eval $(call asm_offsets_template,\ - 
vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c)) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index e5b5089..be7828f 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -233,10 +233,10 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->width = cfg.g_w; oxcf->height = cfg.g_h; /* guess a frame rate if out of whack, use 30 */ - oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num); + oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num); - if (oxcf->frame_rate > 180) { - oxcf->frame_rate = 30; + if (oxcf->framerate > 180) { + oxcf->framerate = 30; } switch (cfg.g_pass) { @@ -1032,6 +1032,7 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP8E_SET_CQ_LEVEL, set_param}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, {VP9E_SET_LOSSLESS, set_param}, + {VP9E_SET_FRAME_PARALLEL_DECODING, set_param}, {VP9_GET_REFERENCE, get_reference}, { -1, NULL}, }; diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index ea6946b..05029b9 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -19,36 +19,29 @@ #include "decoder/vp9_onyxd_int.h" #include "vp9/vp9_iface_common.h" -#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) -typedef vpx_codec_stream_info_t vp8_stream_info_t; +#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? 
VPX_CODEC_CAP_POSTPROC : 0) +typedef vpx_codec_stream_info_t vp9_stream_info_t; /* Structures for handling memory allocations */ typedef enum { - VP8_SEG_ALG_PRIV = 256, - VP8_SEG_MAX + VP9_SEG_ALG_PRIV = 256, + VP9_SEG_MAX } mem_seg_id_t; #define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0]))) -static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t); +static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, + vpx_codec_flags_t flags); -typedef struct { - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - -static const mem_req_t vp8_mem_req_segs[] = { - {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz}, - {VP8_SEG_MAX, 0, 0, 0, NULL} +static const mem_req_t vp9_mem_req_segs[] = { + {VP9_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, priv_sz}, + {VP9_SEG_MAX, 0, 0, 0, NULL} }; struct vpx_codec_alg_priv { vpx_codec_priv_t base; - vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs) - 1]; + vpx_codec_mmap_t mmaps[NELEMENTS(vp9_mem_req_segs) - 1]; vpx_codec_dec_cfg_t cfg; - vp8_stream_info_t si; + vp9_stream_info_t si; int defer_alloc; int decoder_init; VP9D_PTR pbi; @@ -67,8 +60,8 @@ struct vpx_codec_alg_priv { int invert_tile_order; }; -static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, - vpx_codec_flags_t flags) { +static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, + vpx_codec_flags_t flags) { /* Although this declaration is constant, we can't use it in the requested * segments list because we want to define the requested segments list * before defining the private type (so that the number of memory maps is @@ -78,59 +71,7 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, return sizeof(vpx_codec_alg_priv_t); } - -static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) { - free(mmap->priv); -} - -static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) { - vpx_codec_err_t res; - unsigned int align; - - align = mmap->align ? mmap->align - 1 : 0; - - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); - - res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vp8_mmap_dtor; - return res; -} - -static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - vpx_codec_flags_t init_flags) { - int i; - vpx_codec_err_t res = VPX_CODEC_OK; - - for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) { - /* Ensure the segment has been allocated */ - if (!mmaps[i].base) { - res = VPX_CODEC_MEM_ERROR; - break; - } - - /* Verify variable size segment is big enough for the current si. 
*/ - if (vp8_mem_req_segs[i].calc_sz) { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) { - res = VPX_CODEC_MEM_ERROR; - break; - } - } - } - - return res; -} - -static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { +static void vp9_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { int i; ctx->priv = mmap->base; @@ -139,7 +80,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { ctx->priv->alg_priv = mmap->base; for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) - ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id; + ctx->priv->alg_priv->mmaps[i].id = vp9_mem_req_segs[i].id; ctx->priv->alg_priv->mmaps[0] = *mmap; ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); @@ -152,20 +93,11 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { } } -static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) { - int i; - - for (i = 0; i < NELEMENTS(ctx->mmaps); i++) - if (ctx->mmaps[i].id == id) - return ctx->mmaps[i].base; - - return NULL; -} -static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { +static void vp9_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { /* nothing to clean up */ } -static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, +static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; @@ -176,15 +108,15 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, if (!ctx->priv) { vpx_codec_mmap_t mmap; - mmap.id = vp8_mem_req_segs[0].id; + mmap.id = vp9_mem_req_segs[0].id; mmap.sz = sizeof(vpx_codec_alg_priv_t); - mmap.align = vp8_mem_req_segs[0].align; - mmap.flags = vp8_mem_req_segs[0].flags; + mmap.align = vp9_mem_req_segs[0].align; + mmap.flags = vp9_mem_req_segs[0].flags; - res = vp8_mmap_alloc(&mmap); + res = vpx_mmap_alloc(&mmap); if (!res) { - vp8_init_ctx(ctx, &mmap); + vp9_init_ctx(ctx, &mmap); ctx->priv->alg_priv->defer_alloc = 1; /*post processing level initialized to do nothing */ @@ -194,7 +126,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, return res; } -static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { +static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) { int i; vp9_remove_decompressor(ctx->pbi); @@ -207,43 +139,44 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { return VPX_CODEC_OK; } -static vpx_codec_err_t vp8_peek_si(const uint8_t *data, +static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si) { vpx_codec_err_t res = VPX_CODEC_OK; - if (data + data_sz <= data) + if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM; + + if (data + data_sz <= data) { res = VPX_CODEC_INVALID_PARAM; - else { - si->is_kf = 0; + } else { + const int frame_marker = (data[0] >> 6) & 0x3; + const int version = (data[0] >> 4) & 0x3; + if (frame_marker != 0x2) return VPX_CODEC_UNSUP_BITSTREAM; + if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; - if (data_sz >= 8 && (data[0] & 0xD8) == 0x80) { /* I-Frame */ + si->is_kf = !((data[0] >> 2) & 0x1); + if (si->is_kf) { const uint8_t *c = data + 1; - si->is_kf = 1; if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2) - res = VPX_CODEC_UNSUP_BITSTREAM; + return VPX_CODEC_UNSUP_BITSTREAM; - si->w = (c[3] << 8) | c[4]; - si->h = (c[5] << 8) | c[6]; - - // printf("w=%d, h=%d\n", si->w, si->h); - if (!(si->h | si->w)) - res = VPX_CODEC_UNSUP_BITSTREAM; - 
} else - res = VPX_CODEC_UNSUP_BITSTREAM; + c += 3; + si->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1; + si->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1; + } } return res; } -static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx, +static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, vpx_codec_stream_info_t *si) { unsigned int sz; - if (si->sz >= sizeof(vp8_stream_info_t)) - sz = sizeof(vp8_stream_info_t); + if (si->sz >= sizeof(vp9_stream_info_t)) + sz = sizeof(vp9_stream_info_t); else sz = sizeof(vpx_codec_stream_info_t); @@ -293,27 +226,29 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, cfg.w = ctx->si.w; cfg.h = ctx->si.h; - ctx->mmaps[i].id = vp8_mem_req_segs[i].id; - ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz; - ctx->mmaps[i].align = vp8_mem_req_segs[i].align; - ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags; + ctx->mmaps[i].id = vp9_mem_req_segs[i].id; + ctx->mmaps[i].sz = vp9_mem_req_segs[i].sz; + ctx->mmaps[i].align = vp9_mem_req_segs[i].align; + ctx->mmaps[i].flags = vp9_mem_req_segs[i].flags; if (!ctx->mmaps[i].sz) - ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg, + ctx->mmaps[i].sz = vp9_mem_req_segs[i].calc_sz(&cfg, ctx->base.init_flags); - res = vp8_mmap_alloc(&ctx->mmaps[i]); + res = vpx_mmap_alloc(&ctx->mmaps[i]); } if (!res) - vp8_finalize_mmaps(ctx); + vp9_finalize_mmaps(ctx); ctx->defer_alloc = 0; } /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { - res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags); + res = vpx_validate_mmaps(&ctx->si, ctx->mmaps, + vp9_mem_req_segs, NELEMENTS(vp9_mem_req_segs), + ctx->base.init_flags); if (!res) { VP9D_CONFIG oxcf; @@ -483,7 +418,7 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, return res; } -static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, +static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; @@ -501,24 +436,22 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, return img; } - -static -vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) { +static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx, + vpx_codec_mmap_t *mmap, + vpx_codec_iter_t *iter) { vpx_codec_err_t res; const mem_req_t *seg_iter = *iter; /* Get address of next segment request */ do { if (!seg_iter) - seg_iter = vp8_mem_req_segs; - else if (seg_iter->id != VP8_SEG_MAX) + seg_iter = vp9_mem_req_segs; + else if (seg_iter->id != VP9_SEG_MAX) seg_iter++; *iter = (vpx_codec_iter_t)seg_iter; - if (seg_iter->id != VP8_SEG_MAX) { + if (seg_iter->id != VP9_SEG_MAX) { mmap->id = seg_iter->id; mmap->sz = seg_iter->sz; mmap->align = seg_iter->align; @@ -535,15 +468,15 @@ vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx, return res; } -static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, +static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; int i, done; if (!ctx->priv) { - if (mmap->id == VP8_SEG_ALG_PRIV) { + if (mmap->id == VP9_SEG_ALG_PRIV) { if (!ctx->priv) { - vp8_init_ctx(ctx, mmap); + vp9_init_ctx(ctx, mmap); res = VPX_CODEC_OK; } } @@ -564,17 +497,16 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, } if (done && !res) { - vp8_finalize_mmaps(ctx->priv->alg_priv); + vp9_finalize_mmaps(ctx->priv->alg_priv); res = ctx->iface->init(ctx, NULL); } return 
res; } - -static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -591,9 +523,9 @@ static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx, } -static vpx_codec_err_t vp9_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -626,9 +558,9 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { #if CONFIG_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); @@ -644,9 +576,9 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { +static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) { #if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC int data = va_arg(args, int); @@ -665,9 +597,9 @@ static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { +static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) { int *update_info = va_arg(args, int *); VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; @@ -680,9 +612,9 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } -static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { +static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) { int *corrupted = va_arg(args, int *); @@ -704,15 +636,15 @@ static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx, } static vpx_codec_ctrl_fn_map_t ctf_maps[] = { - {VP8_SET_REFERENCE, vp9_set_reference}, - {VP8_COPY_REFERENCE, vp9_copy_reference}, - {VP8_SET_POSTPROC, vp8_set_postproc}, - {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options}, - {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options}, - {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options}, - {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options}, - {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, - {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, + {VP8_SET_REFERENCE, set_reference}, + {VP8_COPY_REFERENCE, copy_reference}, + {VP8_SET_POSTPROC, set_postproc}, + {VP8_SET_DBG_COLOR_REF_FRAME, set_dbg_options}, + {VP8_SET_DBG_COLOR_MB_MODES, set_dbg_options}, + {VP8_SET_DBG_COLOR_B_MODES, set_dbg_options}, + {VP8_SET_DBG_DISPLAY_MV, set_dbg_options}, + {VP8D_GET_LAST_REF_UPDATES, get_last_ref_updates}, + {VP8D_GET_FRAME_CORRUPTED, get_frame_corrupted}, {VP9_GET_REFERENCE, get_reference}, {VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order}, { -1, NULL}, @@ -725,18 +657,18 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC, + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC, /* vpx_codec_caps_t caps; */ - vp8_init, /* vpx_codec_init_fn_t init; */ - 
vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ + vp9_init, /* vpx_codec_init_fn_t init; */ + vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */ ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */ - vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */ + vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */ + vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */ { - vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ - vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ + vp9_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ + vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp9_decode, /* vpx_codec_decode_fn_t decode; */ - vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ }, { /* encoder functions */ diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h index dc41d77..ed0122c 100644 --- a/libvpx/vp9/vp9_iface_common.h +++ b/libvpx/vp9/vp9_iface_common.h @@ -29,7 +29,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, img->fmt = VPX_IMG_FMT_I420; } img->w = yv12->y_stride; - img->h = multiple8(yv12->y_height + 2 * VP9BORDERINPIXELS); + img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3); img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; img->x_chroma_shift = yv12->uv_width < yv12->y_width; @@ -74,8 +74,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0; yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; - yv12->clrtype = REG_YUV; - #if CONFIG_ALPHA // For development purposes, force alpha to hold the same data a Y for now. yv12->alpha_buffer = yv12->y_buffer; diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 4bed6c0..dee83c9 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -58,6 +58,8 @@ VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_sad_c.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h +VP9_CX_SRCS-yes += encoder/vp9_subexp.c +VP9_CX_SRCS-yes += encoder/vp9_subexp.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c @@ -73,27 +75,24 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm 
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm +ifeq ($(ARCH_X86_64),yes) +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm +endif VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk index 7ae3219..6cad293 100644 --- a/libvpx/vp9/vp9dx.mk +++ b/libvpx/vp9/vp9dx.mk @@ -17,7 +17,6 @@ VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) VP9_DX_SRCS-yes += vp9_dx_iface.c -VP9_DX_SRCS-yes += decoder/vp9_asm_dec_offsets.c VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.c VP9_DX_SRCS-yes += decoder/vp9_decodframe.c @@ -33,10 +32,10 @@ VP9_DX_SRCS-yes += decoder/vp9_treereader.h VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h +VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c +VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c - -$(eval $(call asm_offsets_template,\ - vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c)) +VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM) diff --git a/libvpx/vpx/internal/vpx_codec_internal.h b/libvpx/vpx/internal/vpx_codec_internal.h index d7bcd46..05fed97 100644 --- a/libvpx/vpx/internal/vpx_codec_internal.h +++ b/libvpx/vpx/internal/vpx_codec_internal.h @@ -94,9 +94,10 @@ typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx); /*!\brief parse stream info function pointer prototype * - * Performs high level parsing of the bitstream. This function is called by - * the generic vpx_codec_parse_stream() wrapper function, so plugins implementing - * this interface may trust the input parameters to be properly initialized. + * Performs high level parsing of the bitstream. This function is called by the + * generic vpx_codec_peek_stream_info() wrapper function, so plugins + * implementing this interface may trust the input parameters to be properly + * initialized. 
* * \param[in] data Pointer to a block of data to parse * \param[in] data_sz Size of the data buffer @@ -301,7 +302,7 @@ struct vpx_codec_iface { vpx_codec_set_mmap_fn_t set_mmap; /**< \copydoc ::vpx_codec_set_mmap_fn_t */ struct vpx_codec_dec_iface { vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ - vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ + vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ } dec; @@ -473,4 +474,30 @@ static void vpx_internal_error(struct vpx_internal_error_info *info, if (info->setjmp) longjmp(info->jmp, info->error_code); } + +//------------------------------------------------------------------------------ +// mmap interface + +typedef struct { + unsigned int id; + unsigned long sz; + unsigned int align; + unsigned int flags; + unsigned long (*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); +} mem_req_t; + +// Allocates mmap.priv and sets mmap.base based on mmap.sz/align/flags +// requirements. +// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. +vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap); + +// Frees mmap.base allocated by a call to vpx_mmap_alloc(). +void vpx_mmap_dtor(vpx_codec_mmap_t *mmap); + +// Checks each mmap has the size requirement specificied by mem_reqs. +// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. +vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, + const vpx_codec_mmap_t *mmaps, + const mem_req_t *mem_reqs, int nreqs, + vpx_codec_flags_t init_flags); #endif diff --git a/libvpx/vpx/src/vpx_codec.c b/libvpx/vpx/src/vpx_codec.c index 61d7f4c..1f664ae 100644 --- a/libvpx/vpx/src/vpx_codec.c +++ b/libvpx/vpx/src/vpx_codec.c @@ -14,6 +14,7 @@ * */ #include <stdarg.h> +#include <stdlib.h> #include "vpx/vpx_integer.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" @@ -133,3 +134,51 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } + +//------------------------------------------------------------------------------ +// mmap interface + +vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap) { + unsigned int align = mmap->align ? mmap->align - 1 : 0; + + if (mmap->flags & VPX_CODEC_MEM_ZERO) + mmap->priv = calloc(1, mmap->sz + align); + else + mmap->priv = malloc(mmap->sz + align); + + if (mmap->priv == NULL) return VPX_CODEC_MEM_ERROR; + mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); + mmap->dtor = vpx_mmap_dtor; + return VPX_CODEC_OK; +} + +void vpx_mmap_dtor(vpx_codec_mmap_t *mmap) { + free(mmap->priv); +} + +vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, + const vpx_codec_mmap_t *mmaps, + const mem_req_t *mem_reqs, int nreqs, + vpx_codec_flags_t init_flags) { + int i; + + for (i = 0; i < nreqs - 1; ++i) { + /* Ensure the segment has been allocated */ + if (mmaps[i].base == NULL) { + return VPX_CODEC_MEM_ERROR; + } + + /* Verify variable size segment is big enough for the current si. 
*/ + if (mem_reqs[i].calc_sz != NULL) { + vpx_codec_dec_cfg_t cfg; + + cfg.w = si->w; + cfg.h = si->h; + + if (mmaps[i].sz < mem_reqs[i].calc_sz(&cfg, init_flags)) { + return VPX_CODEC_MEM_ERROR; + } + } + } + return VPX_CODEC_OK; +} diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index 754a615..b18155b 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++ b/libvpx/vpx_scale/generic/yv12config.c @@ -170,6 +170,8 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->y_height = aligned_height; ybf->y_stride = y_stride; + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; ybf->uv_width = uv_width; ybf->uv_height = uv_height; ybf->uv_stride = uv_stride; diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c index c38fb80..cc8da2a 100644 --- a/libvpx/vpx_scale/generic/yv12extend.c +++ b/libvpx/vpx_scale/generic/yv12extend.c @@ -96,15 +96,16 @@ vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { } #if CONFIG_VP9 -void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, - int subsampling_x, int subsampling_y) { +static void extend_frame(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y, + int ext_size) { const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x; const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y; - const int c_et = ybf->border >> subsampling_y; - const int c_el = ybf->border >> subsampling_x; - const int c_eb = (ybf->border + ybf->y_height - ybf->y_crop_height + + const int c_et = ext_size >> subsampling_y; + const int c_el = ext_size >> subsampling_x; + const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height + subsampling_y) >> subsampling_y; - const int c_er = (ybf->border + ybf->y_width - ybf->y_crop_width + + const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width + subsampling_x) >> subsampling_x; assert(ybf->y_height - ybf->y_crop_height < 16); @@ -114,9 +115,9 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, - ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); @@ -124,6 +125,19 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); } + + +void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y) { + extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); +} + +void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y) { + const int inner_bw = ybf->border > VP9INNERBORDERINPIXLES ? 
+ VP9INNERBORDERINPIXLES : ybf->border; + extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); +} #endif /**************************************************************************** diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.sh b/libvpx/vpx_scale/vpx_scale_rtcd.sh index b4f8907..21d1e52 100644 --- a/libvpx/vpx_scale/vpx_scale_rtcd.sh +++ b/libvpx/vpx_scale/vpx_scale_rtcd.sh @@ -28,4 +28,7 @@ specialize vp8_yv12_copy_y neon if [ "$CONFIG_VP9" = "yes" ]; then prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" specialize vp9_extend_frame_borders + + prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" + specialize vp9_extend_frame_inner_borders_c fi diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h index 7b8bd85..66e587a 100644 --- a/libvpx/vpx_scale/yv12config.h +++ b/libvpx/vpx_scale/yv12config.h @@ -18,27 +18,10 @@ extern "C" { #include "vpx/vpx_integer.h" #define VP8BORDERINPIXELS 32 -#define VP9BORDERINPIXELS 96 +#define VP9INNERBORDERINPIXLES 96 +#define VP9BORDERINPIXELS 160 #define VP9_INTERP_EXTEND 4 - /************************************* - For INT_YUV: - - Y = (R+G*2+B)/4; - U = (R-B)/2; - V = (G*2 - R - B)/4; - And - R = Y+U-V; - G = Y+V; - B = Y-U-V; - ************************************/ - typedef enum - { - REG_YUV = 0, /* Regular yuv */ - INT_YUV = 1 /* The type of yuv that can be tranfer to and from RGB through integer transform */ - } - YUV_TYPE; - typedef struct yv12_buffer_config { int y_width; int y_height; @@ -49,6 +32,8 @@ extern "C" { int uv_width; int uv_height; + int uv_crop_width; + int uv_crop_height; int uv_stride; /* int uvinternal_width; */ @@ -65,7 +50,6 @@ extern "C" { int buffer_alloc_sz; int border; int frame_size; - YUV_TYPE clrtype; int corrupted; int flags; diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index a60b84d..547b572 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -1180,22 +1180,22 @@ static void usage_exit() { exec_name); fprintf(stderr, "\nOptions:\n"); - arg_show_usage(stdout, main_args); + arg_show_usage(stderr, main_args); fprintf(stderr, "\nEncoder Global Options:\n"); - arg_show_usage(stdout, global_args); + arg_show_usage(stderr, global_args); fprintf(stderr, "\nRate Control Options:\n"); - arg_show_usage(stdout, rc_args); + arg_show_usage(stderr, rc_args); fprintf(stderr, "\nTwopass Rate Control Options:\n"); - arg_show_usage(stdout, rc_twopass_args); + arg_show_usage(stderr, rc_twopass_args); fprintf(stderr, "\nKeyframe Placement Options:\n"); - arg_show_usage(stdout, kf_args); + arg_show_usage(stderr, kf_args); #if CONFIG_VP8_ENCODER fprintf(stderr, "\nVP8 Specific Options:\n"); - arg_show_usage(stdout, vp8_args); + arg_show_usage(stderr, vp8_args); #endif #if CONFIG_VP9_ENCODER fprintf(stderr, "\nVP9 Specific Options:\n"); - arg_show_usage(stdout, vp9_args); + arg_show_usage(stderr, vp9_args); #endif fprintf(stderr, "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt index e74102e..d756208 100644 --- a/mips-dspr2/libvpx_srcs.txt +++ b/mips-dspr2/libvpx_srcs.txt @@ -66,7 +66,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -80,7 +79,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h 
vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/bitstream.c vp8/encoder/bitstream.h vp8/encoder/block.h @@ -136,8 +134,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -161,10 +160,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -192,7 +187,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -201,6 +195,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h index 2905eae..0752f45 100644 --- a/mips-dspr2/vp9_rtcd.h +++ b/mips-dspr2/vp9_rtcd.h @@ -38,28 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -void vp9_copy_mem16x16_dspr2(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_dspr2 +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -void vp9_copy_mem8x8_dspr2(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_dspr2 +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void 
vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define 
vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 
vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -79,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -97,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void 
vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -160,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/mips-dspr2/vpx_config.h b/mips-dspr2/vpx_config.h index 0ca4657..13a092d 100644 --- a/mips-dspr2/vpx_config.h +++ b/mips-dspr2/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/mips-dspr2/vpx_scale_rtcd.h b/mips-dspr2/vpx_scale_rtcd.h index 7af466a..be038f4 100644 --- a/mips-dspr2/vpx_scale_rtcd.h +++ b/mips-dspr2/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void 
vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt index 8c1ec80..402ac24 100644 --- a/mips/libvpx_srcs.txt +++ b/mips/libvpx_srcs.txt @@ -60,7 +60,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -74,7 +73,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/bitstream.c vp8/encoder/bitstream.h vp8/encoder/block.h @@ -130,8 +128,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -155,10 +154,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -186,7 +181,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -195,6 +189,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h index 1d7b4d2..0752f45 100644 --- a/mips/vp9_rtcd.h +++ b/mips/vp9_rtcd.h @@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t 
*yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t 
*ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, 
uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, 
uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/mips/vpx_config.h b/mips/vpx_config.h index 49eab1e..51ea388 100644 --- a/mips/vpx_config.h +++ b/mips/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/mips/vpx_scale_rtcd.h b/mips/vpx_scale_rtcd.h index 
7af466a..be038f4 100644 --- a/mips/vpx_scale_rtcd.h +++ b/mips/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h"
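
Editor's note: for readers following the rewritten vp9_peek_si() hunk in vp9_dx_iface.c above, here is a standalone sketch of the same uncompressed-header peek. The function and type names (peek_vp9_frame_header, vp9_peek_info) are illustrative, and the VP9_SYNC_CODE_* values are what the decoder's SYNC_CODE_0/1/2 macros are commonly defined to (0x49 0x83 0x42); only the bit layout is taken from the diff itself.

#include <stddef.h>
#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49  /* assumed values of the decoder's SYNC_CODE_* macros */
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

typedef struct { int is_kf; int w; int h; } vp9_peek_info;

/* Returns 0 on success, -1 for truncated or unsupported data. */
static int peek_vp9_frame_header(const uint8_t *data, size_t data_sz,
                                 vp9_peek_info *out) {
  if (data == NULL || data_sz <= 8)
    return -1;                                       /* marker + sync + size bytes */

  const int frame_marker = (data[0] >> 6) & 0x3;     /* must be binary 10 */
  const int version = (data[0] >> 4) & 0x3;          /* only version 0 handled here */
  if (frame_marker != 0x2 || version != 0)
    return -1;

  out->is_kf = !((data[0] >> 2) & 0x1);              /* bit clear => key frame */
  out->w = out->h = 0;

  if (out->is_kf) {
    const uint8_t *c = data + 1;
    if (c[0] != VP9_SYNC_CODE_0 || c[1] != VP9_SYNC_CODE_1 ||
        c[2] != VP9_SYNC_CODE_2)
      return -1;
    c += 3;
    /* width-1 and height-1 are 16-bit values spread across nibbles */
    out->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1;
    out->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1;
  }
  return 0;
}

Note the "+ 1": the header stores width-1 and height-1, which is why the old vp8-style "(c[3] << 8) | c[4]" read that this hunk deletes could never apply to the new format.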
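
The yuvconfig2image() change in vp9_iface_common.h swaps multiple8() for ALIGN_POWER_OF_TWO(value, 3), i.e. rounding up to the next multiple of 8. The macro body below is an assumption about its definition (it is not part of this diff), shown only to make the substitution concrete:

/* Assumed definition: round `value` up to the nearest multiple of 2^n.
 * Requires value >= 0 and n small enough that (1 << n) does not overflow. */
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

/* Example: ALIGN_POWER_OF_TWO(1085, 3) == 1088 */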
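
The mmap helpers added to vpx_codec.c above over-allocate by align-1 bytes and round the raw pointer up to the requested alignment. A minimal self-contained sketch of that scheme follows; aligned_block, aligned_alloc_block and aligned_free_block are illustrative names, and `align` is assumed to be a power of two, as it is for the codec's segment requests.

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  void *priv;  /* pointer actually returned by malloc(), kept for free() */
  void *base;  /* first address inside priv that satisfies the alignment */
} aligned_block;

/* Returns 0 on success, -1 on allocation failure. */
static int aligned_alloc_block(aligned_block *blk, size_t sz, size_t align) {
  const size_t mask = align ? align - 1 : 0;  /* valid because align is 2^n */

  blk->priv = malloc(sz + mask);              /* worst-case padding up front */
  if (blk->priv == NULL)
    return -1;

  /* Round the raw pointer up to the next multiple of `align`. */
  blk->base =
      (void *)(((uintptr_t)blk->priv + (uintptr_t)mask) & ~(uintptr_t)mask);
  return 0;
}

static void aligned_free_block(aligned_block *blk) {
  free(blk->priv);  /* free the original pointer, never the aligned one */
  blk->priv = NULL;
  blk->base = NULL;
}

The real vpx_mmap_alloc() additionally honours VPX_CODEC_MEM_ZERO by using calloc() and records vpx_mmap_dtor() as the destructor; the sketch keeps only the alignment arithmetic.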