| field | value | date |
|---|---|---|
| author | hkuang <hkuang@google.com> | 2013-09-16 15:09:58 -0700 |
| committer | Hangyu Kuang <hkuang@google.com> | 2013-09-17 22:05:28 +0000 |
| commit | 1184aebb761cbeac9124c37189a80a1a58f04b6b (patch) | |
| tree | b1ce6b3d29c43ffd22eb18999c5c3bad26513a48 | |
| parent | f3bed9137f66ef693bd406e43b17e9a1114f1e14 (diff) | |
Roll latest libvpx into Android.
The latest libvpx has more NEON optimizations and many algorithmic optimizations that make VP9 decoding much faster.
bug:10804666
Change-Id: I75eaacea57ecc7542a780be778f0e9e157978524
(cherry picked from commit 3df0563f1b24dac6c0bd122fc922a48211269061)
189 files changed, 16429 insertions, 9643 deletions
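Most of the churn below is in the per-target RTCD headers, where a generic function name is bound to its best implementation with a plain `#define` (this build passes `--disable-runtime-cpu-detect`, so the choice is fixed at configure time). The sketch below illustrates that dispatch pattern for `vp9_convolve_copy`; the prototypes are copied from the diff, but the `#if HAVE_NEON` guard is an illustrative assumption — the generated header for each target simply hard-codes one of the two defines.

```c
/* Illustration of the compile-time RTCD dispatch used by the headers in
 * this diff. The generated vp9_rtcd.h hard-codes one #define per target;
 * the HAVE_NEON guard here just shows both outcomes side by side. */
#include <stdint.h>
#include <stddef.h>

void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h);
void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h);

#if HAVE_NEON
#define vp9_convolve_copy vp9_convolve_copy_neon /* armv7a-neon target */
#else
#define vp9_convolve_copy vp9_convolve_copy_c    /* generic/armv7a targets */
#endif
```

Calls through `vp9_convolve_copy(...)` then compile directly to the chosen routine with no indirection, which is why this commit only needs to edit the headers (plus the source lists) to light up the new NEON assembly.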
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt index 25ca5e0..8f8b655 100644 --- a/armv7a-neon/libvpx_srcs.txt +++ b/armv7a-neon/libvpx_srcs.txt @@ -203,13 +203,25 @@ vp8/vp8_cx_iface.c vp8/vp8cx.mk vp8/vp8_dx_iface.c vp8/vp8dx.mk +vp9/common/arm/neon/vp9_avg_neon.asm.s vp9/common/arm/neon/vp9_convolve8_avg_neon.asm.s vp9/common/arm/neon/vp9_convolve8_neon.asm.s vp9/common/arm/neon/vp9_convolve_neon.c +vp9/common/arm/neon/vp9_copy_neon.asm.s vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s +vp9/common/arm/neon/vp9_idct16x16_neon.c +vp9/common/arm/neon/vp9_idct32x32_neon.c vp9/common/arm/neon/vp9_loopfilter_neon.asm.s vp9/common/arm/neon/vp9_mb_lpf_neon.asm.s +vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm.s vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm.s +vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm.s +vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm.s vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h @@ -257,6 +269,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h index 4ebb497..fdca309 100644 --- a/armv7a-neon/vp9_rtcd.h +++ b/armv7a-neon/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 
vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -238,10 +238,12 @@ void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, i #define vp9_blend_b vp9_blend_b_c void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve_copy vp9_convolve_copy_c +void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_neon void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve_avg vp9_convolve_avg_c +void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, 
uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_neon void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -268,41 +270,51 @@ void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8 #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_neon void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct4x4_1_add vp9_short_idct4x4_1_add_c +void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct4x4_1_add vp9_short_idct4x4_1_add_neon void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct4x4_add vp9_short_idct4x4_add_neon void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c +void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_neon void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_neon void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c +void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_neon void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c +void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_neon void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct16x16_add vp9_short_idct16x16_add_c +void vp9_short_idct16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_add vp9_short_idct16x16_add_neon void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c +void vp9_short_idct10_16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_neon void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct32x32_add vp9_short_idct32x32_add_c +void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct32x32_add vp9_short_idct32x32_add_neon void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); -#define vp9_short_iht4x4_add vp9_short_iht4x4_add_c +void vp9_short_iht4x4_add_neon(int16_t 
*input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_short_iht4x4_add vp9_short_iht4x4_add_neon void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); -#define vp9_short_iht8x8_add vp9_short_iht8x8_add_c +void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_short_iht8x8_add vp9_short_iht8x8_add_neon void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_short_iht16x16_add vp9_short_iht16x16_add_c @@ -316,12 +328,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/armv7a-neon/vpx_config.c b/armv7a-neon/vpx_config.c index 77be6fb..dc64ce3 100644 --- a/armv7a-neon/vpx_config.c +++ b/armv7a-neon/vpx_config.c @@ -5,5 +5,5 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ -static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only"; +static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/armv7a-neon/vpx_config.h b/armv7a-neon/vpx_config.h index d132e4d..452bc91 100644 --- a/armv7a-neon/vpx_config.h +++ b/armv7a-neon/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/armv7a-neon/vpx_scale_rtcd.h b/armv7a-neon/vpx_scale_rtcd.h index 9972777..8c2ab2f 100644 --- a/armv7a-neon/vpx_scale_rtcd.h +++ b/armv7a-neon/vpx_scale_rtcd.h @@ -34,13 +34,13 @@ void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vp8_yv12_extend_frame_borders_neon(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_neon -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_neon -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_y_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y 
vp8_yv12_copy_y_neon +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vpx_yv12_copy_y_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_neon void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/armv7a/libvpx_srcs.txt b/armv7a/libvpx_srcs.txt index 2ddb1bd..8f41f9f 100644 --- a/armv7a/libvpx_srcs.txt +++ b/armv7a/libvpx_srcs.txt @@ -212,6 +212,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h index 1ce24c5..36202d2 100644 --- a/armv7a/vp9_rtcd.h +++ b/armv7a/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void 
vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t 
*above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/armv7a/vpx_config.c b/armv7a/vpx_config.c index a246c39..ecdb0cf 100644 --- a/armv7a/vpx_config.c +++ b/armv7a/vpx_config.c @@ -5,5 +5,5 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ -static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --disable-neon --disable-examples --disable-docs --enable-realtime-only"; +static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --disable-neon --disable-examples --disable-docs --enable-realtime-only"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/armv7a/vpx_config.h b/armv7a/vpx_config.h index a330023..c789546 100644 --- a/armv7a/vpx_config.h +++ b/armv7a/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/armv7a/vpx_scale_rtcd.h b/armv7a/vpx_scale_rtcd.h index d4212f2..0df8b37 100644 --- a/armv7a/vpx_scale_rtcd.h +++ b/armv7a/vpx_scale_rtcd.h @@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y vp8_yv12_copy_y_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/generic/libvpx_srcs.txt b/generic/libvpx_srcs.txt index 055f5fb..8e6fad7 100644 --- a/generic/libvpx_srcs.txt +++ b/generic/libvpx_srcs.txt @@ -172,6 +172,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h index 2562e82..4dcc1f6 100644 --- a/generic/vp9_rtcd.h +++ b/generic/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, 
uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define 
vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void 
vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t 
*input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/generic/vpx_config.h b/generic/vpx_config.h index 4d6172b..c856d4d 100644 --- a/generic/vpx_config.h +++ b/generic/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/generic/vpx_scale_rtcd.h b/generic/vpx_scale_rtcd.h index c2842ee..472a290 100644 --- a/generic/vpx_scale_rtcd.h +++ b/generic/vpx_scale_rtcd.h @@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y vp8_yv12_copy_y_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/libvpx/build/make/armlink_adapter.sh b/libvpx/build/make/armlink_adapter.sh index b53669c..75c342e 100755 --- a/libvpx/build/make/armlink_adapter.sh +++ b/libvpx/build/make/armlink_adapter.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. ## @@ -13,20 +13,20 @@ verbose=0 set -- $* for i; do - if [ "$i" == "-o" ]; then + if [ "$i" = "-o" ]; then on_of=1 - elif [ "$i" == "-v" ]; then + elif [ "$i" = "-v" ]; then verbose=1 - elif [ "$i" == "-g" ]; then + elif [ "$i" = "-g" ]; then args="${args} --debug" - elif [ "$on_of" == "1" ]; then + elif [ "$on_of" = "1" ]; then outfile=$i on_of=0 elif [ -f "$i" ]; then infiles="$infiles $i" - elif [ "${i:0:2}" == "-l" ]; then + elif [ "${i#-l}" != "$i" ]; then libs="$libs ${i#-l}" - elif [ "${i:0:2}" == "-L" ]; then + elif [ "${i#-L}" != "$i" ]; then libpaths="${libpaths} ${i#-L}" else args="${args} ${i}" diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index e2566b0..bb7ab41 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## configure.sh ## @@ -198,11 +198,11 @@ add_extralibs() { # # Boolean Manipulation Functions # -enable(){ +enable_feature(){ set_all yes $* } -disable(){ +disable_feature(){ set_all no $* } @@ -219,7 +219,7 @@ soft_enable() { for var in $*; do if ! 
disabled $var; then log_echo " enabling $var" - enable $var + enable_feature $var fi done } @@ -228,7 +228,7 @@ soft_disable() { for var in $*; do if ! enabled $var; then log_echo " disabling $var" - disable $var + disable_feature $var fi done } @@ -251,10 +251,10 @@ tolower(){ # Temporary File Functions # source_path=${0%/*} -enable source_path_used +enable_feature source_path_used if test -z "$source_path" -o "$source_path" = "." ; then source_path="`pwd`" - disable source_path_used + disable_feature source_path_used fi if test ! -z "$TMPDIR" ; then @@ -264,12 +264,13 @@ elif test ! -z "$TEMPDIR" ; then else TMPDIRx="/tmp" fi -TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h" -TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c" -TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc" -TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o" -TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x" -TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm" +RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}') +TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h" +TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c" +TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc" +TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o" +TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x" +TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm" clean_temp_files() { rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM} @@ -316,8 +317,8 @@ check_header(){ header=$1 shift var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'` - disable $var - check_cpp "$@" <<EOF && enable $var + disable_feature $var + check_cpp "$@" <<EOF && enable_feature $var #include "$header" int x; EOF @@ -479,7 +480,7 @@ process_common_cmdline() { for opt in "$@"; do optval="${opt#*=}" case "$opt" in - --child) enable child + --child) enable_feature child ;; --log*) logging="$optval" @@ -491,7 +492,7 @@ process_common_cmdline() { ;; --target=*) toolchain="${toolchain:-${optval}}" ;; - --force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain + --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain ;; --cpu) ;; @@ -511,7 +512,7 @@ process_common_cmdline() { echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null || die_unknown $opt fi - $action $option + ${action}_feature $option ;; --require-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` @@ -523,11 +524,11 @@ process_common_cmdline() { ;; --force-enable-?*|--force-disable-?*) eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'` - $action $option + ${action}_feature $option ;; --libc=*) [ -d "${optval}" ] || die "Not a directory: ${optval}" - disable builtin_libc + disable_feature builtin_libc alt_libc="${optval}" ;; --as=*) @@ -696,13 +697,13 @@ process_common_toolchain() { # Mark the specific ISA requested as enabled soft_enable ${tgt_isa} - enable ${tgt_os} - enable ${tgt_cc} + enable_feature ${tgt_os} + enable_feature ${tgt_cc} # Enable the architecture family case ${tgt_isa} in - arm*) enable arm;; - mips*) enable mips;; + arm*) enable_feature arm;; + mips*) enable_feature mips;; esac # PIC is probably what we want when building shared libs @@ -765,7 +766,7 @@ process_common_toolchain() { case ${toolchain} in sparc-solaris-*) add_extralibs -lposix4 - disable fast_unaligned + disable_feature fast_unaligned ;; *-solaris-*) add_extralibs -lposix4 @@ -790,7 +791,7 @@ process_common_toolchain() { ;; armv5te) soft_enable edsp - disable fast_unaligned + disable_feature fast_unaligned ;; esac @@ -805,7 +806,7 @@ process_common_toolchain() { arch_int=${arch_int%%te} check_add_asflags --defsym ARCHITECTURE=${arch_int} 
tune_cflags="-mtune=" - if [ ${tgt_isa} == "armv7" ]; then + if [ ${tgt_isa} = "armv7" ]; then if [ -z "${float_abi}" ]; then check_cpp <<EOF && float_abi=hard || float_abi=softfp #ifndef __ARM_PCS_VFP @@ -842,8 +843,8 @@ EOF asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" AS_SFX=.s msvs_arch_dir=arm-msvs - disable multithread - disable unit_tests + disable_feature multithread + disable_feature unit_tests ;; rvct) CC=armcc @@ -855,7 +856,7 @@ EOF tune_cflags="--cpu=" tune_asflags="--cpu=" if [ -z "${tune_cpu}" ]; then - if [ ${tgt_isa} == "armv7" ]; then + if [ ${tgt_isa} = "armv7" ]; then if enabled neon then check_add_cflags --fpu=softvfp+vfpv3 @@ -880,8 +881,8 @@ EOF case ${tgt_os} in none*) - disable multithread - disable os_support + disable_feature multithread + disable_feature os_support ;; android*) @@ -913,9 +914,9 @@ EOF # Cortex-A8 implementations (NDK Dev Guide) add_ldflags "-Wl,--fix-cortex-a8" - enable pic + enable_feature pic soft_enable realtime_only - if [ ${tgt_isa} == "armv7" ]; then + if [ ${tgt_isa} = "armv7" ]; then soft_enable runtime_cpu_detect fi if enabled runtime_cpu_detect; then @@ -969,7 +970,7 @@ EOF ;; linux*) - enable linux + enable_feature linux if enabled rvct; then # Check if we have CodeSourcery GCC in PATH. Needed for # libraries @@ -1000,14 +1001,14 @@ EOF tune_cflags="-mtune=" if enabled dspr2; then check_add_cflags -mips32r2 -mdspr2 - disable fast_unaligned + disable_feature fast_unaligned fi check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; ppc*) - enable ppc + enable_feature ppc bits=${tgt_isa##ppc} link_with_cc=gcc setup_gnu_toolchain @@ -1155,7 +1156,7 @@ EOF ;; universal*|*-gcc|generic-gnu) link_with_cc=gcc - enable gcc + enable_feature gcc setup_gnu_toolchain ;; esac @@ -1191,7 +1192,7 @@ EOF # default use_x86inc to yes if pic is no or 64bit or we are not on darwin echo " checking here for x86inc \"${tgt_isa}\" \"$pic\" " - if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then + if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}" ]; then soft_enable use_x86inc fi @@ -1204,14 +1205,14 @@ EOF enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 # Check for strip utility variant - ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip + ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip # Try to determine target endianness check_cc <<EOF unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E'; EOF [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' | - grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian + grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian # Try to find which inline keywords are supported check_cc <<EOF && INLINE="inline" @@ -1236,7 +1237,7 @@ EOF if enabled dspr2; then if enabled big_endian; then echo "dspr2 optimizations are available only for little endian platforms" - disable dspr2 + disable_feature dspr2 fi fi ;; @@ -1287,8 +1288,8 @@ print_config_h() { print_webm_license() { local destination=$1 - local prefix=$2 - local suffix=$3 + local prefix="$2" + local suffix="$3" shift 3 cat <<EOF > ${destination} ${prefix} Copyright (c) 2011 The WebM project authors. 
All Rights Reserved.${suffix} @@ -1309,7 +1310,7 @@ process_detect() { true; } -enable logging +enable_feature logging logfile="config.log" self=$0 process() { diff --git a/libvpx/build/make/gen_asm_deps.sh b/libvpx/build/make/gen_asm_deps.sh index 0b4e3aa..6a7bff9 100755 --- a/libvpx/build/make/gen_asm_deps.sh +++ b/libvpx/build/make/gen_asm_deps.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. ## diff --git a/libvpx/build/make/version.sh b/libvpx/build/make/version.sh index 3efb956..e31e568 100755 --- a/libvpx/build/make/version.sh +++ b/libvpx/build/make/version.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. ## diff --git a/libvpx/configure b/libvpx/configure index 24be893..297cec4 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## configure ## @@ -38,6 +38,7 @@ Advanced options: ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing + ${toggle_vp9_postproc} vp9 specific postprocessing ${toggle_multithread} multithreaded encoding and decoding ${toggle_spatial_resampling} spatial sampling (scaling) support ${toggle_realtime_only} enable this option while building for real-time encoding @@ -153,7 +154,7 @@ all_targets="libs examples docs" # all targets available are enabled, by default. for t in ${all_targets}; do - [ -f ${source_path}/${t}.mk ] && enable ${t} + [ -f ${source_path}/${t}.mk ] && enable_feature ${t} done # check installed doxygen version @@ -164,30 +165,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then doxy_minor=${doxy_version%%.*} doxy_patch=${doxy_version##*.} - [ $doxy_major -gt 1 ] && enable doxygen - [ $doxy_minor -gt 5 ] && enable doxygen - [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen + [ $doxy_major -gt 1 ] && enable_feature doxygen + [ $doxy_minor -gt 5 ] && enable_feature doxygen + [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen fi # install everything except the sources, by default. sources will have # to be enabled when doing dist builds, since that's no longer a common # case. -enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs -enable install_bins -enable install_libs - -enable static -enable optimizations -enable fast_unaligned #allow unaligned accesses, if supported by hw -enable md5 -enable spatial_resampling -enable multithread -enable os_support -enable temporal_denoising - -[ -d ${source_path}/../include ] && enable alt_tree_layout +enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs +enable_feature install_bins +enable_feature install_libs + +enable_feature static +enable_feature optimizations +enable_feature fast_unaligned #allow unaligned accesses, if supported by hw +enable_feature md5 +enable_feature spatial_resampling +enable_feature multithread +enable_feature os_support +enable_feature temporal_denoising + +[ -d ${source_path}/../include ] && enable_feature alt_tree_layout for d in vp8 vp9; do - [ -d ${source_path}/${d} ] && disable alt_tree_layout; + [ -d ${source_path}/${d} ] && disable_feature alt_tree_layout; done if ! 
enabled alt_tree_layout; then @@ -200,10 +201,10 @@ else [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder" [ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder" [ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder" -[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder -[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder -[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder -[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder +[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder +[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder +[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder +[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt fi @@ -279,6 +280,7 @@ CONFIG_LIST=" dc_recon runtime_cpu_detect postproc + vp9_postproc multithread internal_stats ${CODECS} @@ -314,6 +316,7 @@ CMDLINE_SELECT=" gprof gcov pic + use_x86inc optimizations ccache runtime_cpu_detect @@ -332,6 +335,7 @@ CMDLINE_SELECT=" dequant_tokens dc_recon postproc + vp9_postproc multithread internal_stats ${CODECS} @@ -357,12 +361,12 @@ process_cmdline() { for opt do optval="${opt#*=}" case "$opt" in - --disable-codecs) for c in ${CODECS}; do disable $c; done ;; + --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;; --enable-?*|--disable-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then if enabled experimental; then - $action $option + ${action}_feature $option else log_echo "Ignoring $opt -- not in experimental mode." fi @@ -383,8 +387,8 @@ post_process_cmdline() { # If the codec family is enabled, enable all components of that family. log_echo "Configuring selected codecs" for c in ${CODECS}; do - disabled ${c%%_*} && disable ${c} - enabled ${c%%_*} && enable ${c} + disabled ${c%%_*} && disable_feature ${c} + enabled ${c%%_*} && enable_feature ${c} done # Enable all detected codecs, if they haven't been disabled @@ -392,12 +396,12 @@ post_process_cmdline() { # Enable the codec family if any component of that family is enabled for c in ${CODECS}; do - enabled $c && enable ${c%_*} + enabled $c && enable_feature ${c%_*} done # Set the {en,de}coders variable if any algorithm in that class is enabled for c in ${CODECS}; do - enabled ${c} && enable ${c##*_}s + enabled ${c} && enable_feature ${c##*_}s done } @@ -437,7 +441,7 @@ process_targets() { done enabled debug_libs && DIST_DIR="${DIST_DIR}-debug" enabled codec_srcs && DIST_DIR="${DIST_DIR}-src" - ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost" + ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost" ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt" ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs" DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}" @@ -507,13 +511,13 @@ process_detect() { fi if [ -z "$CC" ] || enabled external_build; then echo "Bypassing toolchain for environment detection." 
- enable external_build + enable_feature external_build check_header() { log fake_check_header "$@" header=$1 shift var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'` - disable $var + disable_feature $var # Headers common to all environments case $header in stdio.h) @@ -525,7 +529,7 @@ process_detect() { [ -f "${d##-I}/$header" ] && result=true && break done ${result:-true} - esac && enable $var + esac && enable_feature $var # Specialize windows and POSIX environments. case $toolchain in @@ -533,7 +537,7 @@ process_detect() { case $header-$toolchain in stdint*-gcc) true;; *) false;; - esac && enable $var + esac && enable_feature $var ;; *) case $header in @@ -542,7 +546,7 @@ process_detect() { sys/mman.h) true;; unistd.h) true;; *) false;; - esac && enable $var + esac && enable_feature $var esac enabled $var } @@ -560,7 +564,7 @@ EOF check_header sys/mman.h check_header unistd.h # for sysconf(3) and friends. - check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports + check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports } process_toolchain() { @@ -642,14 +646,18 @@ process_toolchain() { # ccache only really works on gcc toolchains enabled gcc || soft_disable ccache if enabled mips; then - enable dequant_tokens - enable dc_recon + enable_feature dequant_tokens + enable_feature dc_recon + fi + + if enabled internal_stats; then + enable_feature vp9_postproc fi # Enable the postbuild target if building for visual studio. case "$tgt_cc" in - vs*) enable msvs - enable solution + vs*) enable_feature msvs + enable_feature solution vs_version=${tgt_cc##vs} case $vs_version in [789]) diff --git a/libvpx/examples.mk b/libvpx/examples.mk index 5b5ca23..c17fac9 100644 --- a/libvpx/examples.mk +++ b/libvpx/examples.mk @@ -49,6 +49,9 @@ vpxenc.DESCRIPTION = Full featured encoder UTILS-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder +UTILS-$(CONFIG_VP8_ENCODER) += vp9_spatial_scalable_encoder.c +vp8_scalable_patterns.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D +vp8_scalable_patterns.DESCRIPTION = Spatial Scalable Encoder # Clean up old ivfenc, ivfdec binaries. ifeq ($(CONFIG_MSVS),yes) diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h index cd33d12..de94186 100644 --- a/libvpx/test/acm_random.h +++ b/libvpx/test/acm_random.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef LIBVPX_TEST_ACM_RANDOM_H_ -#define LIBVPX_TEST_ACM_RANDOM_H_ +#ifndef TEST_ACM_RANDOM_H_ +#define TEST_ACM_RANDOM_H_ #include "third_party/googletest/src/include/gtest/gtest.h" @@ -59,4 +59,4 @@ class ACMRandom { } // namespace libvpx_test -#endif // LIBVPX_TEST_ACM_RANDOM_H_ +#endif // TEST_ACM_RANDOM_H_ diff --git a/libvpx/test/borders_test.cc b/libvpx/test/borders_test.cc index 7bfece8..dcdedcf 100644 --- a/libvpx/test/borders_test.cc +++ b/libvpx/test/borders_test.cc @@ -29,8 +29,8 @@ class BordersTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if ( video->frame() == 1) { - encoder->Control(VP8E_SET_CPUUSED, 0); + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, 1); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); diff --git a/libvpx/test/clear_system_state.h b/libvpx/test/clear_system_state.h index e240981..8f08a4c 100644 --- a/libvpx/test/clear_system_state.h +++ b/libvpx/test/clear_system_state.h @@ -10,7 +10,7 @@ #ifndef TEST_CLEAR_SYSTEM_STATE_H_ #define TEST_CLEAR_SYSTEM_STATE_H_ -#include "vpx_config.h" +#include "./vpx_config.h" extern "C" { #if ARCH_X86 || ARCH_X86_64 # include "vpx_ports/x86.h" diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index b1510c6..3100571 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <string.h> #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" @@ -187,7 +188,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) { protected: static const int kDataAlignment = 16; - static const int kOuterBlockSize = 128; + static const int kOuterBlockSize = 256; static const int kInputStride = kOuterBlockSize; static const int kOutputStride = kOuterBlockSize; static const int kMaxDimension = 64; @@ -224,6 +225,10 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) { input_[i] = prng.Rand8Extremes(); } + void SetConstantInput(int value) { + memset(input_, value, kInputBufferSize); + } + void CheckGuardBlocks() { for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) @@ -456,45 +461,86 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = { { 128} }; +/* This test exercises the horizontal and vertical filter functions. */ TEST_P(ConvolveTest, ChangeFilterWorks) { uint8_t* const in = input(); uint8_t* const out = output(); + + /* Assume that the first input sample is at the 8/16th position. */ + const int kInitialSubPelOffset = 8; + + /* Filters are 8-tap, so the first filter tap will be applied to the pixel + * at position -3 with respect to the current filtering position. Since + * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8, + * which is non-zero only in the last tap. So, applying the filter at the + * current input position will result in an output equal to the pixel at + * offset +4 (-3 + 7) with respect to the current filtering position. + */ const int kPixelSelected = 4; + /* Assume that each output pixel requires us to step on by 17/16th pixels in + * the input. + */ + const int kInputPixelStep = 17; + + /* The filters are setup in such a way that the expected output produces + * sets of 8 identical output samples. 
As the filter position moves to the + * next 1/16th pixel position the only active (=128) filter tap moves one + * position to the left, resulting in the same input pixel being replicated + * in to the output for 8 consecutive samples. After each set of 8 positions + * the filters select a different input pixel. kFilterPeriodAdjust below + * computes which input pixel is written to the output for a specified + * x or y position. + */ + + /* Test the horizontal filter. */ REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride, - kChangeFilters[8], 17, kChangeFilters[4], 16, - Width(), Height())); + kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, NULL, 0, Width(), Height())); for (int x = 0; x < Width(); ++x) { - const int kQ4StepAdjust = x >> 4; const int kFilterPeriodAdjust = (x >> 3) << 3; - const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected; - ASSERT_EQ(in[ref_x], out[x]) << "x == " << x; + const int ref_x = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjust * kInputPixelStep) + >> SUBPEL_BITS); + ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width(); } + /* Test the vertical filter. */ REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride, - kChangeFilters[4], 16, kChangeFilters[8], 17, - Width(), Height())); + NULL, 0, kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, Width(), Height())); for (int y = 0; y < Height(); ++y) { - const int kQ4StepAdjust = y >> 4; const int kFilterPeriodAdjust = (y >> 3) << 3; - const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected; + const int ref_y = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjust * kInputPixelStep) + >> SUBPEL_BITS); ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y; } + /* Test the horizontal and vertical filters in combination. */ REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride, - kChangeFilters[8], 17, kChangeFilters[8], 17, + kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, + kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, Width(), Height())); for (int y = 0; y < Height(); ++y) { - const int kQ4StepAdjustY = y >> 4; const int kFilterPeriodAdjustY = (y >> 3) << 3; - const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected; + const int ref_y = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjustY * kInputPixelStep) + >> SUBPEL_BITS); for (int x = 0; x < Width(); ++x) { - const int kQ4StepAdjustX = x >> 4; const int kFilterPeriodAdjustX = (x >> 3) << 3; - const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected; + const int ref_x = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjustX * kInputPixelStep) + >> SUBPEL_BITS); ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x]) << "x == " << x << ", y == " << y; @@ -502,6 +548,34 @@ TEST_P(ConvolveTest, ChangeFilterWorks) { } } +/* This test exercises that enough rows and columns are filtered with every + possible initial fractional positions and scaling steps. */ +TEST_P(ConvolveTest, CheckScalingFiltering) { + uint8_t* const in = input(); + uint8_t* const out = output(); + + SetConstantInput(127); + + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. 
*/ + REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride, + vp9_sub_pel_filters_8[frac], step, + vp9_sub_pel_filters_8[frac], step, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x]) + << "x == " << x << ", y == " << y + << ", frac == " << frac << ", step == " << step; + } + } + } + } +} using std::tr1::make_tuple; diff --git a/libvpx/test/cpu_speed_test.cc b/libvpx/test/cpu_speed_test.cc index e6ad75b..c92e723 100644 --- a/libvpx/test/cpu_speed_test.cc +++ b/libvpx/test/cpu_speed_test.cc @@ -108,5 +108,5 @@ using std::tr1::make_tuple; VP9_INSTANTIATE_TEST_CASE( CpuSpeedTest, ::testing::Values(::libvpx_test::kTwoPassGood), - ::testing::Range(0, 3)); + ::testing::Range(0, 5)); } // namespace diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc index 287e805..f020a99 100644 --- a/libvpx/test/datarate_test.cc +++ b/libvpx/test/datarate_test.cc @@ -75,7 +75,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest, bits_in_buffer_model_ -= frame_size_in_bits; // Update the running total of bits for end of test datarate checks. - bits_total_ += frame_size_in_bits ; + bits_total_ += frame_size_in_bits; // If first drop not set and we have a drop set it to this time. if (!first_drop_ && duration > 1) diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 0795054..7d49c12 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -13,15 +13,16 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" -#include "vpx_ports/mem.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" extern "C" { #include "vp9/common/vp9_entropy.h" -#include "vp9_rtcd.h" -void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch); +#include "./vp9_rtcd.h" +void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch); } - -#include "acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -31,12 +32,13 @@ namespace { #ifdef _MSC_VER static int round(double x) { if (x < 0) - return (int)ceil(x - 0.5); + return static_cast<int>(ceil(x - 0.5)); else - return (int)floor(x + 0.5); + return static_cast<int>(floor(x + 0.5)); } #endif +const int kNumCoeffs = 256; const double PI = 3.1415926535898; void reference2_16x16_idct_2d(double *input, double *output) { double x; @@ -45,7 +47,9 @@ void reference2_16x16_idct_2d(double *input, double *output) { double s = 0; for (int i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256; + x = cos(PI * j * (l + 0.5) / 16.0) * + cos(PI * i * (k + 0.5) / 16.0) * + input[i * 16 + j] / 256; if (i != 0) x *= sqrt(2.0); if (j != 0) @@ -59,23 +63,23 @@ void reference2_16x16_idct_2d(double *input, double *output) { } -static const double C1 = 0.995184726672197; -static const double C2 = 0.98078528040323; -static const double C3 = 0.956940335732209; -static const double C4 = 0.923879532511287; -static const double C5 = 0.881921264348355; -static const double C6 = 0.831469612302545; -static const double C7 = 0.773010453362737; -static const double C8 = 0.707106781186548; -static const double C9 = 0.634393284163646; -static const double C10 = 0.555570233019602; -static const double C11 = 0.471396736825998; -static const double C12 = 0.38268343236509; -static const 
double C13 = 0.290284677254462; -static const double C14 = 0.195090322016128; -static const double C15 = 0.098017140329561; - -static void butterfly_16x16_dct_1d(double input[16], double output[16]) { +const double C1 = 0.995184726672197; +const double C2 = 0.98078528040323; +const double C3 = 0.956940335732209; +const double C4 = 0.923879532511287; +const double C5 = 0.881921264348355; +const double C6 = 0.831469612302545; +const double C7 = 0.773010453362737; +const double C8 = 0.707106781186548; +const double C9 = 0.634393284163646; +const double C10 = 0.555570233019602; +const double C11 = 0.471396736825998; +const double C12 = 0.38268343236509; +const double C13 = 0.290284677254462; +const double C14 = 0.195090322016128; +const double C15 = 0.098017140329561; + +void butterfly_16x16_dct_1d(double input[16], double output[16]) { double step[16]; double intermediate[16]; double temp1, temp2; @@ -108,36 +112,36 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { output[6] = step[1] - step[6]; output[7] = step[0] - step[7]; - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; + temp1 = step[ 8] * C7; + temp2 = step[15] * C9; output[ 8] = temp1 + temp2; - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; + temp1 = step[ 9] * C11; + temp2 = step[14] * C5; output[ 9] = temp1 - temp2; - temp1 = step[10]*C3; - temp2 = step[13]*C13; + temp1 = step[10] * C3; + temp2 = step[13] * C13; output[10] = temp1 + temp2; - temp1 = step[11]*C15; - temp2 = step[12]*C1; + temp1 = step[11] * C15; + temp2 = step[12] * C1; output[11] = temp1 - temp2; - temp1 = step[11]*C1; - temp2 = step[12]*C15; + temp1 = step[11] * C1; + temp2 = step[12] * C15; output[12] = temp2 + temp1; - temp1 = step[10]*C13; - temp2 = step[13]*C3; + temp1 = step[10] * C13; + temp2 = step[13] * C3; output[13] = temp2 - temp1; - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; + temp1 = step[ 9] * C5; + temp2 = step[14] * C11; output[14] = temp2 + temp1; - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; + temp1 = step[ 8] * C9; + temp2 = step[15] * C7; output[15] = temp2 - temp1; // step 3 @@ -146,20 +150,20 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { step[ 2] = output[1] - output[2]; step[ 3] = output[0] - output[3]; - temp1 = output[4]*C14; - temp2 = output[7]*C2; + temp1 = output[4] * C14; + temp2 = output[7] * C2; step[ 4] = temp1 + temp2; - temp1 = output[5]*C10; - temp2 = output[6]*C6; + temp1 = output[5] * C10; + temp2 = output[6] * C6; step[ 5] = temp1 + temp2; - temp1 = output[5]*C6; - temp2 = output[6]*C10; + temp1 = output[5] * C6; + temp2 = output[6] * C10; step[ 6] = temp2 - temp1; - temp1 = output[4]*C2; - temp2 = output[7]*C14; + temp1 = output[4] * C2; + temp2 = output[7] * C14; step[ 7] = temp2 - temp1; step[ 8] = output[ 8] + output[11]; @@ -176,18 +180,18 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { output[ 0] = (step[ 0] + step[ 1]); output[ 8] = (step[ 0] - step[ 1]); - temp1 = step[2]*C12; - temp2 = step[3]*C4; + temp1 = step[2] * C12; + temp2 = step[3] * C4; temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); + output[ 4] = 2*(temp1 * C8); - temp1 = step[2]*C4; - temp2 = step[3]*C12; + temp1 = step[2] * C4; + temp2 = step[3] * C12; temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); + output[12] = 2 * (temp1 * C8); - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); + output[ 2] = 2 * ((step[4] + step[ 5]) * C8); + output[14] = 2 * ((step[7] - step[ 6]) * C8); temp1 = step[4] - step[5]; temp2 = step[6] + step[7]; 
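For reference, the constants C1 through C15 above are cos(k*pi/32) for k = 1..15 (so C8 = cos(8*pi/32) = 1/sqrt(2)), and butterfly_16x16_dct_1d implements a scaled, factorized form of the 16-point DCT-II. Below is a minimal sketch of the direct form, mirroring the reference_16x16_dct_1d helper that the next hunk removes; only <cmath> is assumed:

#include <cmath>

// Direct-form 16-point DCT-II: out[k] is the sum over n of
// in[n] * cos(pi * (2n + 1) * k / 32), with the DC (k == 0) term scaled
// by 1/sqrt(2). The butterfly above computes the same transform with
// additional per-stage scaling.
static void direct_16x16_dct_1d(const double in[16], double out[16]) {
  const double kPi = 3.141592653589793238462643383279502884;
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 16; ++k) {
    out[k] = 0.0;
    for (int n = 0; n < 16; ++n)
      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 32.0);
    if (k == 0)
      out[k] *= kInvSqrt2;
  }
}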
@@ -197,17 +201,17 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { intermediate[8] = step[8] + step[14]; intermediate[9] = step[9] + step[15]; - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; + temp1 = intermediate[8] * C12; + temp2 = intermediate[9] * C4; temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); + output[3] = 2 * (temp1 * C8); - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; + temp1 = intermediate[8] * C4; + temp2 = intermediate[9] * C12; temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); + output[13] = 2 * (temp1 * C8); - output[ 9] = 2*((step[10] + step[11])*C8); + output[ 9] = 2 * ((step[10] + step[11]) * C8); intermediate[11] = step[10] - step[11]; intermediate[12] = step[12] + step[13]; @@ -218,207 +222,300 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { output[15] = (intermediate[11] + intermediate[12]); output[ 1] = -(intermediate[11] - intermediate[12]); - output[ 7] = 2*(intermediate[13]*C8); + output[ 7] = 2 * (intermediate[13] * C8); - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; + temp1 = intermediate[14] * C12; + temp2 = intermediate[15] * C4; temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); + output[11] = -2 * (temp1 * C8); - temp1 = intermediate[14]*C4; - temp2 = intermediate[15]*C12; + temp1 = intermediate[14] * C4; + temp2 = intermediate[15] * C12; temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); + output[ 5] = 2 * (temp1 * C8); } -static void reference_16x16_dct_1d(double in[16], double out[16]) { - const double kPi = 3.141592653589793238462643383279502884; - const double kInvSqrt2 = 0.707106781186547524400844362104; - for (int k = 0; k < 16; k++) { - out[k] = 0.0; - for (int n = 0; n < 16; n++) - out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0); - if (k == 0) - out[k] = out[k]*kInvSqrt2; - } -} - -void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) { +void reference_16x16_dct_2d(int16_t input[256], double output[256]) { // First transform columns for (int i = 0; i < 16; ++i) { double temp_in[16], temp_out[16]; for (int j = 0; j < 16; ++j) - temp_in[j] = input[j*16 + i]; + temp_in[j] = input[j * 16 + i]; butterfly_16x16_dct_1d(temp_in, temp_out); for (int j = 0; j < 16; ++j) - output[j*16 + i] = temp_out[j]; + output[j * 16 + i] = temp_out[j]; } // Then transform rows for (int i = 0; i < 16; ++i) { double temp_in[16], temp_out[16]; for (int j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; + temp_in[j] = output[j + i * 16]; butterfly_16x16_dct_1d(temp_in, temp_out); // Scale by some magic number for (int j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]/2; + output[j + i * 16] = temp_out[j]/2; } } -void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, - int stride, int /*tx_type*/) { +typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride); +typedef void (*idct_t)(int16_t *in, uint8_t *out, int stride); +typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type); +typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type); + +void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { vp9_short_fdct16x16_c(in, out, stride); } -void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, - int stride, int /*tx_type*/) { - vp9_short_idct16x16_add_c(out, dst, stride >> 1); -} -void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, - int stride, int tx_type) { - // FIXME(jingning): need to test both SSE2 and c -#if HAVE_SSE2 - vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type); 
-#else - vp9_short_fht16x16_c(in, out, stride >> 1, tx_type); -#endif -} -void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, - int stride, int tx_type) { - vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type); + +void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { + vp9_short_fht16x16_c(in, out, stride, tx_type); } -class FwdTrans16x16Test : public ::testing::TestWithParam<int> { +class Trans16x16TestBase { public: - virtual ~FwdTrans16x16Test() {} + virtual ~Trans16x16TestBase() {} - virtual void SetUp() { - tx_type_ = GetParam(); - if (tx_type_ == 0) { - fwd_txfm = fdct16x16; - inv_txfm = idct16x16_add; - } else { - fwd_txfm = fht16x16; - inv_txfm = iht16x16_add; + protected: + virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0; + + virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0; + + void RunAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; + } + + REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, + test_temp_block, pitch_)); + REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); + + for (int j = 0; j < kNumCoeffs; ++j) { + const uint32_t diff = dst[j] - src[j]; + const uint32_t error = diff * diff; + if (max_error < error) + max_error = error; + total_error += error; + } } - } - protected: - void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, - int stride, int tx_type) { - (*fwd_txfm)(in, out, dst, stride, tx_type); - } - void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, - int stride, int tx_type) { - (*inv_txfm)(in, out, dst, stride, tx_type); + EXPECT_GE(1u, max_error) + << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; + + EXPECT_GE(count_test_block , total_error) + << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; } - int tx_type_; - void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int); - void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int); -}; + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); -TEST_P(FwdTrans16x16Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - int max_error = 0; - double total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256); - DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); - DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256); - - for (int j = 0; j < 256; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); + for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. 
- test_input_block[j] = src[j] - dst[j]; + for (int j = 0; j < kNumCoeffs; ++j) + input_block[j] = rnd.Rand8() - rnd.Rand8(); + + fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); + REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_EQ(output_block[j], output_ref_block[j]); } + } - const int pitch = 32; - RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); - RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); - for (int j = 0; j < 256; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - if (max_error < error) - max_error = error; - total_error += error; + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = rnd.Rand8() - rnd.Rand8(); + input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; + } + if (i == 0) + for (int j = 0; j < kNumCoeffs; ++j) + input_extreme_block[j] = 255; + if (i == 1) + for (int j = 0; j < kNumCoeffs; ++j) + input_extreme_block[j] = -255; + + fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); + REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, + output_block, pitch_)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) { + EXPECT_EQ(output_block[j], output_ref_block[j]); + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) + << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; + } } } - EXPECT_GE(1, max_error) - << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; + void RunInvAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); - EXPECT_GE(count_test_block , total_error) - << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; -} + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; -TEST_P(FwdTrans16x16Test, CoeffSizeCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256); - DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); - - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 256; ++j) { - input_block[j] = rnd.Rand8() - rnd.Rand8(); - input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; - } - if (i == 0) - for (int j = 0; j < 256; ++j) - input_extreme_block[j] = 255; - - const int pitch = 32; - RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_); - RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_); - - // The minimum quant value is 4. 
- for (int j = 0; j < 256; ++j) { - EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) - << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j])) - << "Error: 16x16 FDCT extreme has coefficient larger " - << "than 4*DCT_MAX_VALUE"; + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; + } + + reference_16x16_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) + coeff[j] = round(out_r[j]); + + const int pitch = 32; + REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch)); + + for (int j = 0; j < kNumCoeffs; ++j) { + const uint32_t diff = dst[j] - src[j]; + const uint32_t error = diff * diff; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error + << " at index " << j; + } } } + int pitch_; + int tx_type_; + fht_t fwd_txfm_ref; +}; + +class Trans16x16DCT : public Trans16x16TestBase, + public PARAMS(fdct_t, idct_t, int) { + public: + virtual ~Trans16x16DCT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 32; + fwd_txfm_ref = fdct16x16_ref; + } + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, int stride) { + fwd_txfm_(in, out, stride); + } + void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride >> 1); + } + + fdct_t fwd_txfm_; + idct_t inv_txfm_; +}; + +TEST_P(Trans16x16DCT, AccuracyCheck) { + RunAccuracyCheck(); } -INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4)); +TEST_P(Trans16x16DCT, CoeffCheck) { + RunCoeffCheck(); +} -TEST(VP9Idct16x16Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t in[256], coeff[256]; - uint8_t dst[256], src[256]; - double out_r[256]; +TEST_P(Trans16x16DCT, MemCheck) { + RunMemCheck(); +} - for (int j = 0; j < 256; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. 
- for (int j = 0; j < 256; ++j) - in[j] = src[j] - dst[j]; - - reference_16x16_dct_2d(in, out_r); - for (int j = 0; j < 256; j++) - coeff[j] = round(out_r[j]); - vp9_short_idct16x16_add_c(coeff, dst, 16); - for (int j = 0; j < 256; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - EXPECT_GE(1, error) - << "Error: 16x16 IDCT has error " << error - << " at index " << j; - } +TEST_P(Trans16x16DCT, InvAccuracyCheck) { + RunInvAccuracyCheck(); +} + +class Trans16x16HT : public Trans16x16TestBase, + public PARAMS(fht_t, iht_t, int) { + public: + virtual ~Trans16x16HT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 16; + fwd_txfm_ref = fht16x16_ref; + } + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, int stride) { + fwd_txfm_(in, out, stride, tx_type_); + } + void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride, tx_type_); } + + fht_t fwd_txfm_; + iht_t inv_txfm_; +}; + +TEST_P(Trans16x16HT, AccuracyCheck) { + RunAccuracyCheck(); +} + +TEST_P(Trans16x16HT, CoeffCheck) { + RunCoeffCheck(); +} + +TEST_P(Trans16x16HT, MemCheck) { + RunMemCheck(); } +using std::tr1::make_tuple; + +INSTANTIATE_TEST_CASE_P( + C, Trans16x16DCT, + ::testing::Values( + make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0))); +INSTANTIATE_TEST_CASE_P( + C, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0), + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1), + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2), + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, Trans16x16DCT, + ::testing::Values( + make_tuple(&vp9_short_fdct16x16_sse2, &vp9_short_idct16x16_add_c, 0))); +INSTANTIATE_TEST_CASE_P( + SSE2, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0), + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1), + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2), + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3))); +#endif } // namespace diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc index e05d482..f331886 100644 --- a/libvpx/test/dct32x32_test.cc +++ b/libvpx/test/dct32x32_test.cc @@ -13,15 +13,17 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" extern "C" { +#include "./vpx_config.h" #include "vp9/common/vp9_entropy.h" #include "./vp9_rtcd.h" - void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch); - void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch); } -#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -30,35 +32,15 @@ namespace { #ifdef _MSC_VER static int round(double x) { if (x < 0) - return (int)ceil(x - 0.5); + return static_cast<int>(ceil(x - 0.5)); else - return (int)floor(x + 0.5); + return static_cast<int>(floor(x + 0.5)); } #endif -static const double kPi = 3.141592653589793238462643383279502884; -static void reference2_32x32_idct_2d(double *input, double *output) { - double x; - for (int l = 0; l < 32; ++l) { - for (int k = 0; k < 32; ++k) { - double s = 0; - for (int 
i = 0; i < 32; ++i) { - for (int j = 0; j < 32; ++j) { - x = cos(kPi * j * (l + 0.5) / 32.0) * - cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024; - if (i != 0) - x *= sqrt(2.0); - if (j != 0) - x *= sqrt(2.0); - s += x; - } - } - output[k * 32 + l] = s / 4; - } - } -} - -static void reference_32x32_dct_1d(double in[32], double out[32], int stride) { +const int kNumCoeffs = 1024; +const double kPi = 3.141592653589793238462643383279502884; +void reference_32x32_dct_1d(const double in[32], double out[32], int stride) { const double kInvSqrt2 = 0.707106781186547524400844362104; for (int k = 0; k < 32; k++) { out[k] = 0.0; @@ -69,7 +51,8 @@ static void reference_32x32_dct_1d(double in[32], double out[32], int stride) { } } -static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) { +void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], + double output[kNumCoeffs]) { // First transform columns for (int i = 0; i < 32; ++i) { double temp_in[32], temp_out[32]; @@ -91,102 +74,189 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) { } } -TEST(VP9Idct32x32Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t in[1024], coeff[1024]; - uint8_t dst[1024], src[1024]; - double out_r[1024]; +typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride); +typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride); - for (int j = 0; j < 1024; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 1024; ++j) - in[j] = src[j] - dst[j]; - - reference_32x32_dct_2d(in, out_r); - for (int j = 0; j < 1024; j++) - coeff[j] = round(out_r[j]); - vp9_short_idct32x32_add_c(coeff, dst, 32); - for (int j = 0; j < 1024; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - EXPECT_GE(1, error) - << "Error: 32x32 IDCT has error " << error - << " at index " << j; - } +class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) { + public: + virtual ~Trans32x32Test() {} + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop } -} -TEST(VP9Fdct32x32Test, AccuracyCheck) { + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + int version_; + fwd_txfm_t fwd_txfm_; + inv_txfm_t inv_txfm_; +}; + +TEST_P(Trans32x32Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - unsigned int max_error = 0; + uint32_t max_error = 0; int64_t total_error = 0; const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[1024]; - int16_t test_temp_block[1024]; - uint8_t dst[1024], src[1024]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); - for (int j = 0; j < 1024; ++j) { + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { src[j] = rnd.Rand8(); dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. 
- for (int j = 0; j < 1024; ++j) test_input_block[j] = src[j] - dst[j]; + } const int pitch = 64; - vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch); - vp9_short_idct32x32_add_c(test_temp_block, dst, 32); + REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch)); + REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32)); - for (int j = 0; j < 1024; ++j) { - const unsigned diff = dst[j] - src[j]; - const unsigned error = diff * diff; + for (int j = 0; j < kNumCoeffs; ++j) { + const uint32_t diff = dst[j] - src[j]; + const uint32_t error = diff * diff; if (max_error < error) max_error = error; total_error += error; } } + if (version_ == 1) { + max_error /= 2; + total_error /= 45; + } + EXPECT_GE(1u, max_error) - << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1"; + << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1"; EXPECT_GE(count_test_block, total_error) - << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block"; + << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block"; } -TEST(VP9Fdct32x32Test, CoeffSizeCheck) { +TEST_P(Trans32x32Test, CoeffCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; + + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); + for (int i = 0; i < count_test_block; ++i) { - int16_t input_block[1024], input_extreme_block[1024]; - int16_t output_block[1024], output_extreme_block[1024]; + for (int j = 0; j < kNumCoeffs; ++j) + input_block[j] = rnd.Rand8() - rnd.Rand8(); + + const int pitch = 64; + vp9_short_fdct32x32_c(input_block, output_ref_block, pitch); + REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch)); + + if (version_ == 0) { + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_EQ(output_block[j], output_ref_block[j]) + << "Error: 32x32 FDCT versions have mismatched coefficients"; + } else { + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_GE(6, abs(output_block[j] - output_ref_block[j])) + << "Error: 32x32 FDCT rd has mismatched coefficients"; + } + } +} + +TEST_P(Trans32x32Test, MemCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 2000; + + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); + for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 1024; ++j) { + for (int j = 0; j < kNumCoeffs; ++j) { input_block[j] = rnd.Rand8() - rnd.Rand8(); - input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; + input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255; } if (i == 0) - for (int j = 0; j < 1024; ++j) + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = 255; + if (i == 1) + for (int j = 0; j < kNumCoeffs; ++j) + input_extreme_block[j] = -255; const int pitch = 64; - vp9_short_fdct32x32_c(input_block, output_block, pitch); - vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch); + vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch); + REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch)); // The minimum quant value is 4. 
- for (int j = 0; j < 1024; ++j) { - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j])) - << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j])) - << "Error: 32x32 FDCT extreme has coefficient larger than " - "4*DCT_MAX_VALUE"; + for (int j = 0; j < kNumCoeffs; ++j) { + if (version_ == 0) { + EXPECT_EQ(output_block[j], output_ref_block[j]) + << "Error: 32x32 FDCT versions have mismatched coefficients"; + } else { + EXPECT_GE(6, abs(output_block[j] - output_ref_block[j])) + << "Error: 32x32 FDCT rd has mismatched coefficients"; + } + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j])) + << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE"; + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) + << "Error: 32x32 FDCT has coefficient larger than " + << "4*DCT_MAX_VALUE"; } } } + +TEST_P(Trans32x32Test, InverseAccuracy) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); + + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; + + // Initialize a test block with input range [-255, 255] + for (int j = 0; j < kNumCoeffs; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; + } + + reference_32x32_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) + coeff[j] = round(out_r[j]); + REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); + for (int j = 0; j < kNumCoeffs; ++j) { + const int diff = dst[j] - src[j]; + const int error = diff * diff; + EXPECT_GE(1, error) + << "Error: 32x32 IDCT has error " << error + << " at index " << j; + } + } +} + +using std::tr1::make_tuple; + +INSTANTIATE_TEST_CASE_P( + C, Trans32x32Test, + ::testing::Values( + make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0), + make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, Trans32x32Test, + ::testing::Values( + make_tuple(&vp9_short_fdct32x32_sse2, + &vp9_short_idct32x32_add_sse2, 0), + make_tuple(&vp9_short_fdct32x32_rd_sse2, + &vp9_short_idct32x32_add_sse2, 1))); +#endif } // namespace diff --git a/libvpx/test/decode_test_driver.h b/libvpx/test/decode_test_driver.h index 49e7384..055c45e 100644 --- a/libvpx/test/decode_test_driver.h +++ b/libvpx/test/decode_test_driver.h @@ -12,7 +12,7 @@ #define TEST_DECODE_TEST_DRIVER_H_ #include <cstring> #include "third_party/googletest/src/include/gtest/gtest.h" -#include "vpx_config.h" +#include "./vpx_config.h" #include "vpx/vpx_decoder.h" namespace libvpx_test { @@ -36,9 +36,8 @@ class DxDataIterator { }; // Provides a simplified interface to manage one video decoding. -// -// TODO: similar to Encoder class, the exact services should be -// added as more tests are added. +// Similar to Encoder class, the exact services should be added +// as more tests are added. class Decoder { public: Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline) diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc index eed3e33..709831e 100644 --- a/libvpx/test/encode_test_driver.cc +++ b/libvpx/test/encode_test_driver.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_config.h" +#include "./vpx_config.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/decode_test_driver.h" @@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1, const unsigned int height_y = img1->d_h; unsigned int i; for (i = 0; i < height_y; ++i) - match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - width_y) == 0) && match; + match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + width_y) == 0) && match; const unsigned int width_uv = (img1->d_w + 1) >> 1; const unsigned int height_uv = (img1->d_h + 1) >> 1; for (i = 0; i < height_uv; ++i) - match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - width_uv) == 0) && match; + match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + width_uv) == 0) && match; for (i = 0; i < height_uv; ++i) - match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - width_uv) == 0) && match; + match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + width_uv) == 0) && match; return match; } @@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) { Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0); bool again; for (again = true, video->Begin(); again; video->Next()) { - again = video->img() != NULL; + again = (video->img() != NULL); PreEncodeFrameHook(video); PreEncodeFrameHook(video, encoder); diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc index d4a6967..16d250c 100644 --- a/libvpx/test/error_resilience_test.cc +++ b/libvpx/test/error_resilience_test.cc @@ -62,7 +62,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest, if (droppable_nframes_ > 0 && (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { for (unsigned int i = 0; i < droppable_nframes_; ++i) { - if (droppable_frames_[i] == nframes_) { + if (droppable_frames_[i] == video->frame()) { std::cout << " Encoding droppable frame: " << droppable_frames_[i] << "\n"; frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST | @@ -148,7 +148,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) { const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; cfg_.rc_target_bitrate = 2000; - cfg_.g_lag_in_frames = 25; + cfg_.g_lag_in_frames = 10; init_flags_ = VPX_CODEC_USE_PSNR; @@ -179,6 +179,9 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) { const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; cfg_.rc_target_bitrate = 500; + // FIXME(debargha): Fix this to work for any lag. 
+ // Currently this test only works for lag = 0 + cfg_.g_lag_in_frames = 0; init_flags_ = VPX_CODEC_USE_PSNR; diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc index 9dcc078..ea40ca6 100644 --- a/libvpx/test/fdct4x4_test.cc +++ b/libvpx/test/fdct4x4_test.cc @@ -15,10 +15,10 @@ #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -136,7 +136,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; - double total_error = 0; + int total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16); @@ -156,7 +156,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) { RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 16; ++j) { - if(test_temp_block[j] > 0) { + if (test_temp_block[j] > 0) { test_temp_block[j] += 2; test_temp_block[j] /= 4; test_temp_block[j] *= 4; diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc index 50e2e9d..ee6c9f6 100644 --- a/libvpx/test/fdct8x8_test.cc +++ b/libvpx/test/fdct8x8_test.cc @@ -13,14 +13,16 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" #include "vpx_ports/mem.h" extern "C" { -#include "vp9_rtcd.h" -void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch); +#include "./vp9_rtcd.h" +void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch); } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -62,6 +64,7 @@ class FwdTrans8x8Test : public ::testing::TestWithParam<int> { inv_txfm = iht8x8_add; } } + virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, @@ -92,8 +95,9 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 64; ++j) test_input_block[j] = rnd.Rand8() - rnd.Rand8(); - - RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_output_block, + NULL, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -121,8 +125,9 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) { // Initialize a test block with input range [-15, 15]. 
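// Editor's note: Rand8() yields a value in [0, 255], so (Rand8() >> 4) is
// in [0, 15] and the difference of two such draws below spans exactly
// [-15, 15], e.g. (255 >> 4) - (0 >> 4) == 15 and (0 >> 4) - (255 >> 4) == -15.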
for (int j = 0; j < 64; ++j) test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); - - RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_output_block, + NULL, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -148,7 +153,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) { TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; - double total_error = 0; + int total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); @@ -165,9 +170,11 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); - for (int j = 0; j < 64; ++j){ - if(test_temp_block[j] > 0) { + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); + for (int j = 0; j < 64; ++j) { + if (test_temp_block[j] > 0) { test_temp_block[j] += 2; test_temp_block[j] /= 4; test_temp_block[j] *= 4; @@ -177,7 +184,9 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { test_temp_block[j] *= 4; } } - RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunInvTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; @@ -199,7 +208,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { TEST_P(FwdTrans8x8Test, ExtremalCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; - double total_error = 0; + int total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); @@ -216,8 +225,12 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); - RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); + REGISTER_STATE_CHECK( + RunInvTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; diff --git a/libvpx/test/i420_video_source.h b/libvpx/test/i420_video_source.h index bcbe8a7..2bf2a03 100644 --- a/libvpx/test/i420_video_source.h +++ b/libvpx/test/i420_video_source.h @@ -11,6 +11,7 @@ #define TEST_I420_VIDEO_SOURCE_H_ #include <cstdio> #include <cstdlib> +#include <string> #include "test/video_source.h" @@ -34,7 +35,6 @@ class I420VideoSource : public VideoSource { height_(0), framerate_numerator_(rate_numerator), framerate_denominator_(rate_denominator) { - // This initializes raw_sz_, width_, height_ and allocates an img. 
SetSize(width, height); } diff --git a/libvpx/test/idct8x8_test.cc b/libvpx/test/idct8x8_test.cc index 67db78b..fc8129e 100644 --- a/libvpx/test/idct8x8_test.cc +++ b/libvpx/test/idct8x8_test.cc @@ -15,10 +15,10 @@ #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -27,10 +27,10 @@ namespace { #ifdef _MSC_VER static int round(double x) { - if(x < 0) - return (int)ceil(x - 0.5); + if (x < 0) + return static_cast<int>(ceil(x - 0.5)); else - return (int)floor(x + 0.5); + return static_cast<int>(floor(x + 0.5)); } #endif diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc index aa786cb..2c7fa0e 100644 --- a/libvpx/test/idct_test.cc +++ b/libvpx/test/idct_test.cc @@ -16,7 +16,9 @@ extern "C" { #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" -typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr, +#include "vpx/vpx_integer.h" + +typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); namespace { @@ -34,7 +36,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { virtual void TearDown() { libvpx_test::ClearSystemState(); } idct_fn_t UUT; - short input[16]; + int16_t input[16]; unsigned char output[256]; unsigned char predict[256]; }; diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc index da96741..f5f6d5b 100644 --- a/libvpx/test/intrapred_test.cc +++ b/libvpx/test/intrapred_test.cc @@ -15,8 +15,8 @@ #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vp8/common/blockd.h" #include "vpx_mem/vpx_mem.h" } @@ -106,9 +106,9 @@ class IntraPredBase { for (int y = 0; y < block_size_; y++) sum += data_ptr_[p][y * stride_ - 1]; expected = (sum + (1 << (shift - 1))) >> shift; - } else + } else { expected = 0x80; - + } // check that all subsequent lines are equal to the first for (int y = 1; y < block_size_; ++y) ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_], diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h index 926f801..3fbafbd 100644 --- a/libvpx/test/ivf_video_source.h +++ b/libvpx/test/ivf_video_source.h @@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) { // so that we can do actual file decodes. class IVFVideoSource : public CompressedVideoSource { public: - IVFVideoSource(const std::string &file_name) + explicit IVFVideoSource(const std::string &file_name) : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc index f7572e8..7ee2898 100644 --- a/libvpx/test/keyframe_test.cc +++ b/libvpx/test/keyframe_test.cc @@ -132,7 +132,6 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { // Verify that keyframes match the file keyframes in the file. 
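// Editor's note (assumption): on this clip the encoder auto-places
// keyframes on a fixed 30-frame cadence in realtime mode, so every
// recorded keyframe pts after the initial frame should satisfy
// (*iter - 1) % 30 == 0, which is what the loop below asserts.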
for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { - if (deadline_ == VPX_DL_REALTIME && *iter > 0) EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " << *iter; diff --git a/libvpx/test/md5_helper.h b/libvpx/test/md5_helper.h index fc1a974..289f608 100644 --- a/libvpx/test/md5_helper.h +++ b/libvpx/test/md5_helper.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef LIBVPX_TEST_MD5_HELPER_H_ -#define LIBVPX_TEST_MD5_HELPER_H_ +#ifndef TEST_MD5_HELPER_H_ +#define TEST_MD5_HELPER_H_ extern "C" { #include "./md5_utils.h" @@ -25,9 +25,15 @@ class MD5 { void Add(const vpx_image_t *img) { for (int plane = 0; plane < 3; ++plane) { - uint8_t *buf = img->planes[plane]; - const int h = plane ? (img->d_h + 1) >> 1 : img->d_h; - const int w = plane ? (img->d_w + 1) >> 1 : img->d_w; + const uint8_t *buf = img->planes[plane]; + // Calculate the width and height to do the md5 check. For the chroma + // plane, we never want to round down and thus skip a pixel so if + // we are shifting by 1 (chroma_shift) we add 1 before doing the shift. + // This works only for chroma_shift of 0 and 1. + const int h = plane ? (img->d_h + img->y_chroma_shift) >> + img->y_chroma_shift : img->d_h; + const int w = plane ? (img->d_w + img->x_chroma_shift) >> + img->x_chroma_shift : img->d_w; for (int y = 0; y < h; ++y) { MD5Update(&md5_, buf, w); @@ -61,4 +67,4 @@ class MD5 { } // namespace libvpx_test -#endif // LIBVPX_TEST_MD5_HELPER_H_ +#endif // TEST_MD5_HELPER_H_ diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc index 79896fe..e5ac9db 100644 --- a/libvpx/test/pp_filter_test.cc +++ b/libvpx/test/pp_filter_test.cc @@ -11,8 +11,8 @@ #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" } @@ -63,7 +63,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) { // Pointers to top-left pixel of block in the input and output images. uint8_t *const src_image_ptr = src_image + (input_stride << 1); uint8_t *const dst_image_ptr = dst_image + 8; - uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + uint8_t *const flimits = + reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); (void)vpx_memset(flimits, 255, block_width); // Initialize pixels in the input: diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h index fb3f53b..479a42d 100644 --- a/libvpx/test/register_state_check.h +++ b/libvpx/test/register_state_check.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_ -#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_ +#ifndef TEST_REGISTER_STATE_CHECK_H_ +#define TEST_REGISTER_STATE_CHECK_H_ #ifdef _WIN64 @@ -92,4 +92,4 @@ class RegisterStateCheck {}; #endif // _WIN64 -#endif // LIBVPX_TEST_REGISTER_STATE_CHECK_H_ +#endif // TEST_REGISTER_STATE_CHECK_H_ diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index 7412a24..77d3f5c 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -124,6 +124,13 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) { ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, 30, 1, 0, 10); init_flags_ = VPX_CODEC_USE_PSNR; + + // If the number of frames being encoded is smaller than g_lag_in_frames + // the encoded frame is unavailable using the current API. Comparing + // frames to detect mismatch would then not be possible. Set + // g_lag_in_frames = 0 to get around this. + cfg_.g_lag_in_frames = 0; + // q picked such that initial keyframe on this clip is ~30dB PSNR cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index bf3e0b8..453b3a8 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -17,7 +17,6 @@ extern "C" { #include "./vpx_config.h" #if CONFIG_VP8_ENCODER #include "./vp8_rtcd.h" -//#include "vp8/common/blockd.h" #endif #if CONFIG_VP9_ENCODER #include "./vp9_rtcd.h" @@ -428,6 +427,7 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #if HAVE_SSE #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse; INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values( @@ -441,6 +441,7 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( make_tuple(4, 4, sad_4x4x4d_sse))); #endif #endif +#endif #if HAVE_SSE2 #if CONFIG_VP8_ENCODER @@ -451,6 +452,7 @@ const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt; const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; #endif #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; @@ -463,6 +465,7 @@ const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; #endif +#endif const sad_m_by_n_test_param_t sse2_tests[] = { #if CONFIG_VP8_ENCODER make_tuple(16, 16, sad_16x16_wmt), @@ -472,6 +475,7 @@ const sad_m_by_n_test_param_t sse2_tests[] = { make_tuple(4, 4, sad_4x4_wmt), #endif #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC make_tuple(64, 64, sad_64x64_sse2_vp9), make_tuple(64, 32, sad_64x32_sse2_vp9), make_tuple(32, 64, sad_32x64_sse2_vp9), @@ -484,10 +488,12 @@ const sad_m_by_n_test_param_t sse2_tests[] = { make_tuple(8, 8, sad_8x8_sse2_vp9), make_tuple(8, 4, sad_8x4_sse2_vp9), #endif +#endif }; INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; @@ -513,6 +519,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( make_tuple(8, 4, sad_8x4x4d_sse2))); #endif #endif 
+#endif #if HAVE_SSE3 #if CONFIG_VP8_ENCODER @@ -531,9 +538,11 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( #endif #if HAVE_SSSE3 +#if CONFIG_USE_X86INC const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_sse3))); #endif +#endif } // namespace diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc index 3b6112e..9d2e771 100644 --- a/libvpx/test/set_roi.cc +++ b/libvpx/test/set_roi.cc @@ -17,15 +17,19 @@ #include <sys/types.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" extern "C" { #include "vp8/encoder/onyx_int.h" } +using libvpx_test::ACMRandom; + namespace { TEST(Vp8RoiMapTest, ParameterCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 }; int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 }; unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 }; @@ -121,10 +125,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) { for (int i = 0; i < 1000; ++i) { int rand_deltas[4]; int deltas_valid; - rand_deltas[0] = (rand() % 160) - 80; - rand_deltas[1] = (rand() % 160) - 80; - rand_deltas[2] = (rand() % 160) - 80; - rand_deltas[3] = (rand() % 160) - 80; + rand_deltas[0] = rnd(160) - 80; + rand_deltas[1] = rnd(160) - 80; + rand_deltas[2] = rnd(160) - 80; + rand_deltas[3] = rnd(160) - 80; deltas_valid = ((abs(rand_deltas[0]) <= 63) && (abs(rand_deltas[1]) <= 63) && diff --git a/libvpx/test/subtract_test.cc b/libvpx/test/subtract_test.cc index 574bfbf..d1f2729 100644 --- a/libvpx/test/subtract_test.cc +++ b/libvpx/test/subtract_test.cc @@ -13,8 +13,8 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" extern "C" { -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vp8/common/blockd.h" #include "vp8/encoder/block.h" #include "vpx_mem/vpx_mem.h" @@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) { bd.predictor = reinterpret_cast<unsigned char*>( vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor))); - for(int i = 0; kSrcStride[i] > 0; ++i) { + for (int i = 0; kSrcStride[i] > 0; ++i) { // start at block0 be.src = 0; be.base_src = &source; diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1 index 0ac4905..370ffc1 100644 --- a/libvpx/test/test-data.sha1 +++ b/libvpx/test/test-data.sha1 @@ -520,3 +520,7 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm 83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm 94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5 +495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm +65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5 +b6524e4084d15b5d0caaa3d3d1368db30cbee69c vp90-2-03-deltaq.webm +65f45ec9a55537aac76104818278e0978f94a678 vp90-2-03-deltaq.webm.md5 diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index 25e05b9..a64c0b8 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -24,7 +24,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc 
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc @@ -629,3 +629,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5 diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc index 5610c26..a4dbca4 100644 --- a/libvpx/test/test_libvpx.cc +++ b/libvpx/test/test_libvpx.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <string> -#include "vpx_config.h" +#include "./vpx_config.h" extern "C" { #if ARCH_X86 || ARCH_X86_64 #include "vpx_ports/x86.h" @@ -48,7 +48,9 @@ int main(int argc, char **argv) { #endif #if !CONFIG_SHARED - /* Shared library builds don't support whitebox tests that exercise internal symbols. */ +// Shared library builds don't support whitebox tests +// that exercise internal symbols. + #if CONFIG_VP8 vp8_rtcd(); #endif diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc index 9b0e9d5..9bd03b9 100644 --- a/libvpx/test/test_vector_test.cc +++ b/libvpx/test/test_vector_test.cc @@ -159,7 +159,10 @@ const char *kVP9TestVectors[] = { "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm", "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm", "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm", - "vp90-2-03-size-226x226.webm" + "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm", +#if CONFIG_NON420 + "vp91-2-04-yv444.webm" +#endif }; #endif diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index 207b6e7..ca53ffb 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -16,16 +16,16 @@ #include "test/register_state_check.h" #include "vpx/vpx_integer.h" -#include "vpx_config.h" +#include "./vpx_config.h" extern "C" { #include "vpx_mem/vpx_mem.h" #if CONFIG_VP8_ENCODER # include "vp8/common/variance.h" -# include "vp8_rtcd.h" +# include "./vp8_rtcd.h" #endif #if CONFIG_VP9_ENCODER # include "vp9/encoder/vp9_variance.h" -# include "vp9_rtcd.h" +# include "./vp9_rtcd.h" #endif } #include "test/acm_random.h" @@ -107,8 +107,8 @@ static unsigned int subpel_avg_variance_ref(const uint8_t *ref, } template<typename VarianceFunctionType> -class VarianceTest : - public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { +class VarianceTest + : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { public: virtual void SetUp() { const tuple<int, int, VarianceFunctionType>& params = this->GetParam(); @@ -191,9 +191,9 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() { } template<typename SubpelVarianceFunctionType> -class SubpelVarianceTest : - public ::testing::TestWithParam<tuple<int, int, - SubpelVarianceFunctionType> > { +class SubpelVarianceTest + : public ::testing::TestWithParam<tuple<int, int, + SubpelVarianceFunctionType> > { public: virtual void SetUp() { const tuple<int, int, SubpelVarianceFunctionType>& params = @@ -483,6 +483,7 @@ INSTANTIATE_TEST_CASE_P( #endif 
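// Editor's note: as in sad_test.cc above, the SSE2/SSSE3 instantiations
// that follow are additionally fenced with CONFIG_USE_X86INC, because the
// vp9_*_sse2/ssse3 functions they reference are built from x86inc.asm-based
// assembly that can be compiled out at configure time. The gating pattern:
//   #if HAVE_SSE2
//   #if CONFIG_USE_X86INC
//     INSTANTIATE_TEST_CASE_P(SSE2, ...);
//   #endif
//   #endif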
#if HAVE_SSE2 +#if CONFIG_USE_X86INC const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; @@ -596,8 +597,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_sse2), make_tuple(6, 6, subpel_avg_variance64x64_sse2))); #endif +#endif #if HAVE_SSSE3 +#if CONFIG_USE_X86INC + const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 = vp9_sub_pixel_variance4x4_ssse3; const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 = @@ -682,6 +686,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_ssse3), make_tuple(6, 6, subpel_avg_variance64x64_ssse3))); #endif +#endif #endif // CONFIG_VP9_ENCODER } // namespace vp9 diff --git a/libvpx/test/vp8_boolcoder_test.cc b/libvpx/test/vp8_boolcoder_test.cc index c3a8d12..0383af2 100644 --- a/libvpx/test/vp8_boolcoder_test.cc +++ b/libvpx/test/vp8_boolcoder_test.cc @@ -8,10 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -extern "C" { -#include "vp8/encoder/boolhuff.h" -#include "vp8/decoder/dboolhuff.h" -} #include <math.h> #include <stddef.h> @@ -24,6 +20,11 @@ extern "C" { #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" +extern "C" { +#include "vp8/encoder/boolhuff.h" +#include "vp8/decoder/dboolhuff.h" +} + namespace { const int num_tests = 10; @@ -44,7 +45,7 @@ void encrypt_buffer(uint8_t *buffer, int size) { void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, int count) { - int offset = input - (uint8_t *)decrypt_state; + int offset = input - reinterpret_cast<uint8_t *>(decrypt_state); for (int i = 0; i < count; i++) { output[i] = input[i] ^ secret_key[(offset + i) & 15]; } @@ -58,10 +59,10 @@ TEST(VP8, TestBitIO) { ACMRandom rnd(ACMRandom::DeterministicSeed()); for (int n = 0; n < num_tests; ++n) { for (int method = 0; method <= 7; ++method) { // we generate various proba - const int bits_to_test = 1000; - uint8_t probas[bits_to_test]; + const int kBitsToTest = 1000; + uint8_t probas[kBitsToTest]; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { const int parity = i & 1; probas[i] = (method == 0) ? 0 : (method == 1) ? 255 : @@ -76,14 +77,14 @@ TEST(VP8, TestBitIO) { } for (int bit_method = 0; bit_method <= 3; ++bit_method) { const int random_seed = 6432; - const int buffer_size = 10000; + const int kBufferSize = 10000; ACMRandom bit_rnd(random_seed); BOOL_CODER bw; - uint8_t bw_buffer[buffer_size]; - vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size); + uint8_t bw_buffer[kBufferSize]; + vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize); int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 
1 : 0; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { @@ -98,19 +99,20 @@ TEST(VP8, TestBitIO) { #if CONFIG_DECRYPT encrypt_buffer(bw_buffer, buffer_size); vp8dx_start_decode(&br, bw_buffer, buffer_size, - test_decrypt_cb, (void *)bw_buffer); + test_decrypt_cb, + reinterpret_cast<void *>(bw_buffer)); #else - vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL); + vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL); #endif bit_rnd.Reset(random_seed); - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { bit = bit_rnd(2); } GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit) - << "pos: "<< i << " / " << bits_to_test + << "pos: "<< i << " / " << kBitsToTest << " bit_method: " << bit_method << " method: " << method; } diff --git a/libvpx/test/vp8_decrypt_test.cc b/libvpx/test/vp8_decrypt_test.cc index d850f00..b092509 100644 --- a/libvpx/test/vp8_decrypt_test.cc +++ b/libvpx/test/vp8_decrypt_test.cc @@ -26,7 +26,8 @@ const uint8_t test_key[16] = { 0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0 }; -void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) { +void encrypt_buffer(const uint8_t *src, uint8_t *dst, + int size, int offset = 0) { for (int i = 0; i < size; ++i) { dst[i] = src[i] ^ test_key[(offset + i) & 15]; } @@ -34,10 +35,11 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, int count) { - encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state); + encrypt_buffer(input, output, count, + input - reinterpret_cast<uint8_t *>(decrypt_state)); } -} // namespace +} // namespace namespace libvpx_test { diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc index 3c60011..c823436 100644 --- a/libvpx/test/vp8_fdct4x4_test.cc +++ b/libvpx/test/vp8_fdct4x4_test.cc @@ -18,7 +18,7 @@ extern "C" { -#include "vp8_rtcd.h" +#include "./vp8_rtcd.h" } #include "test/acm_random.h" diff --git a/libvpx/test/vp9_boolcoder_test.cc b/libvpx/test/vp9_boolcoder_test.cc index 42b2229..5edde90 100644 --- a/libvpx/test/vp9_boolcoder_test.cc +++ b/libvpx/test/vp9_boolcoder_test.cc @@ -19,7 +19,7 @@ extern "C" { #include "vp9/decoder/vp9_dboolhuff.h" } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) { ACMRandom rnd(ACMRandom::DeterministicSeed()); for (int n = 0; n < num_tests; ++n) { for (int method = 0; method <= 7; ++method) { // we generate various proba - const int bits_to_test = 1000; - uint8_t probas[bits_to_test]; + const int kBitsToTest = 1000; + uint8_t probas[kBitsToTest]; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { const int parity = i & 1; probas[i] = (method == 0) ? 0 : (method == 1) ? 255 : @@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) { } for (int bit_method = 0; bit_method <= 3; ++bit_method) { const int random_seed = 6432; - const int buffer_size = 10000; + const int kBufferSize = 10000; ACMRandom bit_rnd(random_seed); vp9_writer bw; - uint8_t bw_buffer[buffer_size]; + uint8_t bw_buffer[kBufferSize]; vp9_start_encode(&bw, bw_buffer); int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 
1 : 0; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { @@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) { GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0); vp9_reader br; - vp9_reader_init(&br, bw_buffer, buffer_size); + vp9_reader_init(&br, bw_buffer, kBufferSize); bit_rnd.Reset(random_seed); - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { bit = bit_rnd(2); } GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit) - << "pos: " << i << " / " << bits_to_test + << "pos: " << i << " / " << kBitsToTest << " bit_method: " << bit_method << " method: " << method; } diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc index 2476795..332a839 100644 --- a/libvpx/test/vp9_subtract_test.cc +++ b/libvpx/test/vp9_subtract_test.cc @@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) { ACMRandom rnd(ACMRandom::DeterministicSeed()); // FIXME(rbultje) split in its own file - for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES; - bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) { + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) { const int block_width = 4 << b_width_log2(bsize); const int block_height = 4 << b_height_log2(bsize); int16_t *diff = reinterpret_cast<int16_t *>( diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h index 766b4ea..30c4cbb 100644 --- a/libvpx/vp8/common/onyx.h +++ b/libvpx/vp8/common/onyx.h @@ -41,7 +41,8 @@ extern "C" { USAGE_STREAM_FROM_SERVER = 0x0, USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2 + USAGE_CONSTRAINED_QUALITY = 0x2, + USAGE_CONSTANT_QUALITY = 0x3 } END_USAGE; diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c index 841e1e4..250d04c 100644 --- a/libvpx/vp8/encoder/picklpf.c +++ b/libvpx/vp8/encoder/picklpf.c @@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) /* Get baseline error score */ /* Copy the unfiltered / processed recon buffer to the new buffer */ - vp8_yv12_copy_y(saved_frame, cm->frame_to_show); + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); vp8cx_set_alt_lf_level(cpi, filt_mid); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid); @@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) if(ss_err[filt_low] == 0) { /* Get Low filter error score */ - vp8_yv12_copy_y(saved_frame, cm->frame_to_show); + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); vp8cx_set_alt_lf_level(cpi, filt_low); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); @@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { if(ss_err[filt_high] == 0) { - vp8_yv12_copy_y(saved_frame, cm->frame_to_show); + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); vp8cx_set_alt_lf_level(cpi, filt_high); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index 9a7b9c5..19e9d27 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, #else RANGE_CHECK_HI(cfg, g_lag_in_frames, 25); #endif - RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ); + RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); 
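/* Editor's note: VPX_Q widens the rc_end_usage range accepted by the check
 * above; later hunks in this file map it onto the end-usage value newly
 * added to the enum in onyx.h:
 *   VPX_VBR -> USAGE_LOCAL_FILE_PLAYBACK
 *   VPX_CBR -> USAGE_STREAM_FROM_SERVER
 *   VPX_CQ  -> USAGE_CONSTRAINED_QUALITY
 *   VPX_Q   -> USAGE_CONSTANT_QUALITY
 */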
RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); @@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); RANGE_CHECK(vp8_cfg, cq_level, 0, 63); - if(finalize && cfg->rc_end_usage == VPX_CQ) + if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q)) RANGE_CHECK(vp8_cfg, cq_level, cfg->rc_min_quantizer, cfg->rc_max_quantizer); @@ -327,17 +327,14 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->resample_up_water_mark = cfg.rc_resize_up_thresh; oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh; - if (cfg.rc_end_usage == VPX_VBR) - { - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; - } - else if (cfg.rc_end_usage == VPX_CBR) - { - oxcf->end_usage = USAGE_STREAM_FROM_SERVER; - } - else if (cfg.rc_end_usage == VPX_CQ) - { - oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + if (cfg.rc_end_usage == VPX_VBR) { + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + } else if (cfg.rc_end_usage == VPX_CBR) { + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + } else if (cfg.rc_end_usage == VPX_CQ) { + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + } else if (cfg.rc_end_usage == VPX_Q) { + oxcf->end_usage = USAGE_CONSTANT_QUALITY; } oxcf->target_bandwidth = cfg.rc_target_bitrate; @@ -1272,7 +1269,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif - + VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ 1, /* ts_number_layers */ {0}, /* ts_target_bitrate */ {0}, /* ts_rate_decimator */ diff --git a/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm new file mode 100644 index 0000000..7d24530 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm @@ -0,0 +1,116 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_convolve_avg_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_convolve_avg_neon| PROC + push {r4-r6, lr} + ldrd r4, r5, [sp, #32] + mov r6, r2 + + cmp r4, #32 + bgt avg64 + beq avg32 + cmp r4, #8 + bgt avg16 + beq avg8 + b avg4 + +avg64 + sub lr, r1, #32 + sub r4, r3, #32 +avg64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + pld [r2, r3] + vld1.8 {q8-q9}, [r6@128]! + vld1.8 {q10-q11}, [r6@128], r4 + vrhadd.u8 q0, q0, q8 + vrhadd.u8 q1, q1, q9 + vrhadd.u8 q2, q2, q10 + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128]! 
+ vst1.8 {q2-q3}, [r2@128], r4 + subs r5, r5, #1 + bgt avg64_h + pop {r4-r6, pc} + +avg32 + vld1.8 {q0-q1}, [r0], r1 + vld1.8 {q2-q3}, [r0], r1 + vld1.8 {q8-q9}, [r6@128], r3 + vld1.8 {q10-q11}, [r6@128], r3 + pld [r0] + vrhadd.u8 q0, q0, q8 + pld [r0, r1] + vrhadd.u8 q1, q1, q9 + pld [r6] + vrhadd.u8 q2, q2, q10 + pld [r6, r3] + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt avg32 + pop {r4-r6, pc} + +avg16 + vld1.8 {q0}, [r0], r1 + vld1.8 {q1}, [r0], r1 + vld1.8 {q2}, [r6@128], r3 + vld1.8 {q3}, [r6@128], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q2 + pld [r6] + pld [r6, r3] + vrhadd.u8 q1, q1, q3 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt avg16 + pop {r4-r6, pc} + +avg8 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r6@64], r3 + vld1.8 {d3}, [r6@64], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q1 + pld [r6] + pld [r6, r3] + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d1}, [r2@64], r3 + subs r5, r5, #2 + bgt avg8 + pop {r4-r6, pc} + +avg4 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[0]}, [r6@32], r3 + vld1.32 {d2[1]}, [r6@32], r3 + vrhadd.u8 d0, d0, d2 + vst1.32 {d0[0]}, [r2@32], r3 + vst1.32 {d0[1]}, [r2@32], r3 + subs r5, r5, #2 + bgt avg4 + pop {r4-r6, pc} + ENDP + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm index 110a56c..6b20cb9 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -66,46 +66,64 @@ vld1.s16 {q0}, [r5] ; filter_x - add r8, r1, r1, lsl #1 ; src_stride * 3 - add r8, r8, #4 ; src_stride * 3 + 4 - rsb r8, r8, #0 ; reset for src + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 - add r4, r3, r3, lsl #1 ; dst_stride * 3 - sub r4, r4, #4 ; dst_stride * 3 - 4 - rsb r4, r4, #0 ; reset for dst + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 - sub r9, r1, #8 ; post increment for src load - - rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop mov r10, r6 ; w loop counter -loop_horiz - vld1.8 {d24}, [r0]! - vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - - vld1.8 {d25}, [r0]! - vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - - vld1.8 {d26}, [r0]! - vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - - vld1.8 {d27}, [r0]! 
- vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 +loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 vtrn.16 q12, q13 vtrn.8 d24, d25 vtrn.8 d26, d27 - ; extract to s16 + pld [r0, r1, lsl #2] + vmovl.u8 q8, d24 vmovl.u8 q9, d25 vmovl.u8 q10, d26 vmovl.u8 q11, d27 - vtrn.32 d28, d29 ; only the first half is populated + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 vmovl.u8 q12, d28 - vmovl.u8 q13, d30 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] ; slightly out of order load to match the existing data vld1.u32 {d6[0]}, [r2], r3 @@ -116,10 +134,12 @@ loop_horiz sub r2, r2, r3, lsl #2 ; reset for store ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 - MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 - MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -135,24 +155,29 @@ loop_horiz vtrn.16 d2, d3 vtrn.32 d2, d3 vtrn.8 d2, d3 - + ; average the new value and the dst value vrhadd.u8 q1, q1, q3 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r4 + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 subs r6, r6, #4 ; w -= 4 bgt loop_horiz ; outer loop mov r6, r10 ; restore w counter - add r0, r0, r1 ; src += src_stride * 4 - w + add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt loop_horiz + bgt loop_horiz_v pop {r4-r10, pc} @@ -163,66 +188,77 @@ loop_horiz cmp r12, #16 bne vp9_convolve8_avg_vert_c - push {r4-r10, lr} + push {r4-r8, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r7, [sp, #40] ; filter_y - ldr r8, [sp, #48] ; w - ldr r9, [sp, #52] ; h + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h - vld1.s16 {q0}, [r7] ; filter_y + vld1.s16 {q0}, [r4] ; filter_y - mov r5, r1, lsl #1 ; src_stride * 2 - add r5, r5, r1, lsl #3 ; src_stride * 10 - sub r5, r5, #4 ; src_stride * 10 + 4 - rsb r5, r5, #0 ; reset for src + lsl r1, r1, #1 + lsl r3, r3, #1 - add r6, r3, r3, lsl #1 ; dst_stride * 3 - sub r6, r6, #4 ; dst_stride * 3 - 4 - rsb r6, r6, #0 ; reset for dst +loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter - rsb r7, r8, r1, lsl #2 ; reset src for outer loop - rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 - mov r10, r8 ; w loop counter + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 
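; Editor's note: the rewritten vertical pass hoists the first seven source
; rows into q8-q11 ahead of the label below and then loads only the four
; new rows per 4x4 iteration. r1/r3 hold doubled strides while r4/r7 (src)
; and r5/r8 (dst) are offset by one original stride, so loads of adjacent
; rows can issue back to back; the vmov block at the end of the loop slides
; this register window down by four rows instead of reloading all eleven.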
loop_vert ; always process a 4x4 block at a time - vld1.u32 {d16[0]}, [r0], r1 - vld1.u32 {d16[1]}, [r0], r1 - vld1.u32 {d18[0]}, [r0], r1 - vld1.u32 {d18[1]}, [r0], r1 - vld1.u32 {d20[0]}, [r0], r1 - vld1.u32 {d20[1]}, [r0], r1 - vld1.u32 {d22[0]}, [r0], r1 - vld1.u32 {d22[1]}, [r0], r1 - vld1.u32 {d24[0]}, [r0], r1 - vld1.u32 {d24[1]}, [r0], r1 - vld1.u32 {d26[0]}, [r0], r5 + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 ; extract to s16 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 vmovl.u8 q12, d24 vmovl.u8 q13, d26 - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 + vld1.u32 {d6[0]}, [r5@32], r3 + vld1.u32 {d6[1]}, [r8@32], r3 + vld1.u32 {d7[0]}, [r5@32], r3 + vld1.u32 {d7[1]}, [r8@32], r3 - sub r2, r2, r3, lsl #2 ; reset for store + pld [r7] + pld [r4] ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r5] + pld [r8] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -237,22 +273,30 @@ loop_vert ; average the new value and the dst value vrhadd.u8 q1, q1, q3 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r6 + sub r5, r5, r3, lsl #1 ; reset for store + sub r8, r8, r3, lsl #1 - subs r8, r8, #4 ; w -= 4 + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 + + subs r12, r12, #4 ; h -= 4 bgt loop_vert ; outer loop - mov r8, r10 ; restore w counter - add r0, r0, r7 ; src += 4 * src_stride - w - add r2, r2, r12 ; dst += 4 * dst_stride - w - subs r9, r9, #4 ; h -= 4 - bgt loop_vert + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt loop_vert_h - pop {r4-r10, pc} + pop {r4-r8, pc} ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm index 845e4a8..4525845 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -66,52 +66,72 @@ vld1.s16 {q0}, [r5] ; filter_x - add r8, r1, r1, lsl #1 ; src_stride * 3 - add r8, r8, #4 ; src_stride * 3 + 4 - rsb r8, r8, #0 ; reset for src + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 - add r4, r3, r3, lsl #1 ; dst_stride * 3 - sub r4, r4, #4 ; dst_stride * 3 - 4 - rsb r4, r4, #0 ; reset for dst + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 - sub r9, r1, #8 ; post increment for src load - - rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop mov r10, r6 ; w loop counter -loop_horiz - vld1.8 {d24}, [r0]! - vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - - vld1.8 {d25}, [r0]! 
- vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - - vld1.8 {d26}, [r0]! - vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - - vld1.8 {d27}, [r0]! - vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 +loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 vtrn.16 q12, q13 vtrn.8 d24, d25 vtrn.8 d26, d27 - ; extract to s16 + pld [r0, r1, lsl #2] + vmovl.u8 q8, d24 vmovl.u8 q9, d25 vmovl.u8 q10, d26 vmovl.u8 q11, d27 - vtrn.32 d28, d29 ; only the first half is populated + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 vmovl.u8 q12, d28 - vmovl.u8 q13, d30 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 - MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 - MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -128,20 +148,25 @@ loop_horiz vtrn.32 d2, d3 vtrn.8 d2, d3 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r4 + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 subs r6, r6, #4 ; w -= 4 bgt loop_horiz ; outer loop mov r6, r10 ; restore w counter - add r0, r0, r1 ; src += src_stride * 4 - w + add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt loop_horiz + bgt loop_horiz_v pop {r4-r10, pc} @@ -152,59 +177,72 @@ loop_horiz cmp r12, #16 bne vp9_convolve8_vert_c - push {r4-r10, lr} + push {r4-r8, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r7, [sp, #40] ; filter_y - ldr r8, [sp, #48] ; w - ldr r9, [sp, #52] ; h + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h - vld1.s16 {q0}, [r7] ; filter_y + vld1.s16 {q0}, [r4] ; filter_y - mov r5, r1, lsl #1 ; src_stride * 2 - add r5, r5, r1, lsl #3 ; src_stride * 10 - sub r5, r5, #4 ; src_stride * 10 + 4 - rsb r5, r5, #0 ; reset for src + lsl r1, r1, #1 + lsl r3, r3, #1 - add r6, r3, r3, lsl #1 ; dst_stride * 3 - sub r6, r6, #4 ; dst_stride * 3 - 4 - rsb r6, r6, #0 ; reset for dst +loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter - rsb r7, r8, r1, lsl #2 ; reset src for outer loop - rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 - mov r10, r8 ; w loop counter + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 loop_vert ; always process a 4x4 block at a time - vld1.u32 {d16[0]}, [r0], r1 - vld1.u32 {d16[1]}, [r0], r1 
- vld1.u32 {d18[0]}, [r0], r1 - vld1.u32 {d18[1]}, [r0], r1 - vld1.u32 {d20[0]}, [r0], r1 - vld1.u32 {d20[1]}, [r0], r1 - vld1.u32 {d22[0]}, [r0], r1 - vld1.u32 {d22[1]}, [r0], r1 - vld1.u32 {d24[0]}, [r0], r1 - vld1.u32 {d24[1]}, [r0], r1 - vld1.u32 {d26[0]}, [r0], r5 + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 ; extract to s16 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 vmovl.u8 q12, d24 vmovl.u8 q13, d26 + pld [r5] + pld [r8] + ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r7] + pld [r4] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -216,22 +254,27 @@ loop_vert vqmovn.u16 d2, q1 vqmovn.u16 d3, q2 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r6 + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 - subs r8, r8, #4 ; w -= 4 + subs r12, r12, #4 ; h -= 4 bgt loop_vert ; outer loop - mov r8, r10 ; restore w counter - add r0, r0, r7 ; src += 4 * src_stride - w - add r2, r2, r12 ; dst += 4 * dst_stride - w - subs r9, r9, #4 ; h -= 4 - bgt loop_vert + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt loop_vert_h - pop {r4-r10, pc} + pop {r4-r8, pc} ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c index 6e37ff6..d8b24bf 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c @@ -10,6 +10,7 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" +#include "vpx_ports/mem.h" void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -19,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ - uint8_t temp[64 * 72]; + DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); // Account for the vertical phase needing 3 lines prior and 4 lines post int intermediate_height = h + 7; @@ -53,7 +54,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - uint8_t temp[64 * 72]; + DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); int intermediate_height = h + 7; if (x_step_q4 != 16 || y_step_q4 != 16) diff --git a/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm b/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm new file mode 100644 index 0000000..a0bd04a --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm @@ -0,0 +1,84 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_convolve_copy_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_convolve_copy_neon| PROC + push {r4-r5, lr} + ldrd r4, r5, [sp, #28] + + cmp r4, #32 + bgt copy64 + beq copy32 + cmp r4, #8 + bgt copy16 + beq copy8 + b copy4 + +copy64 + sub lr, r1, #32 + sub r3, r3, #32 +copy64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #1 + bgt copy64_h + pop {r4-r5, pc} + +copy32 + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q2-q3}, [r0], r1 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt copy32 + pop {r4-r5, pc} + +copy16 + pld [r0, r1, lsl #1] + vld1.8 {q0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q1}, [r0], r1 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt copy16 + pop {r4-r5, pc} + +copy8 + pld [r0, r1, lsl #1] + vld1.8 {d0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {d2}, [r0], r1 + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d2}, [r2@64], r3 + subs r5, r5, #2 + bgt copy8 + pop {r4-r5, pc} + +copy4 + ldr r12, [r0], r1 + str r12, [r2], r3 + subs r5, r5, #1 + bgt copy4 + pop {r4-r5, pc} + ENDP + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c new file mode 100644 index 0000000..3e3e400 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, + int16_t *output, + int output_stride); +extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, + int16_t *output, + int output_stride); +extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +extern void save_neon_registers(); +extern void restore_neon_registers(); + + +void vp9_short_idct16x16_add_neon(int16_t *input, + uint8_t *dest, int dest_stride) { + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + + // save d8-d15 register values. + save_neon_registers(); + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. 
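For orientation, a minimal scalar sketch of the row/column structure that the four pass1/pass2 calls below implement; idct16_1d is a hypothetical stand-in for the NEON pass pair, and clip_pixel mirrors the saturating narrowing done in pass 2:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Hypothetical scalar 16-point inverse transform (declaration only). */
void idct16_1d(const int16_t *in, int16_t *out);

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

static void idct16x16_ref(const int16_t *input, uint8_t *dest,
                          int dest_stride) {
  int16_t rows[16 * 16];
  int i, j;
  for (i = 0; i < 16; ++i)                    /* row pass */
    idct16_1d(input + i * 16, rows + i * 16);
  for (j = 0; j < 16; ++j) {                  /* column pass, then add */
    int16_t col[16], out[16];
    for (i = 0; i < 16; ++i)
      col[i] = rows[i * 16 + j];
    idct16_1d(col, out);
    for (i = 0; i < 16; ++i)
      dest[i * dest_stride + j] = clip_pixel(
          dest[i * dest_stride + j] + ROUND_POWER_OF_TWO(out[i], 6));
  }
}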
+ vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_short_idct16x16_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the lower 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_short_idct16x16_add_neon_pass2(input+8*16+1, + row_idct_output+8, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + + // restore d8-d15 register values. + restore_neon_registers(); + + return; +} + +void vp9_short_idct10_16x16_add_neon(int16_t *input, + uint8_t *dest, int dest_stride) { + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + + // save d8-d15 register values. + save_neon_registers(); + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_short_idct10_16x16_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Skip Parallel idct on the lower 8 rows as they are all 0s */ + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. 
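A note on the vp9_short_idct10_16x16_add_neon variant above: skipping the lower 8 rows is valid because, when at most the first 10 coefficients in scan order are nonzero, rows 8-15 of the input block are entirely zero, and a linear transform of zero input is zero. A hypothetical guard spelling out that assumption (the real code presumably relies on the caller's eob check):

#include <stdint.h>

/* Returns 1 when rows 8..15 of a 16x16 coefficient block are all zero,
 * in which case the row pass over those rows can be skipped outright. */
static int lower_rows_are_zero(const int16_t *input) {
  int i;
  for (i = 8 * 16; i < 16 * 16; ++i)
    if (input[i]) return 0;
  return 1;
}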
+ vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + + // restore d8-d15 register values. + restore_neon_registers(); + + return; +} diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c new file mode 100644 index 0000000..ceecd6f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" + +// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +extern void idct32_transpose_and_transform(int16_t *transpose_buffer, + int16_t *output, int16_t *input); +extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); + + +// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +extern void save_neon_registers(); +extern void restore_neon_registers(); + +void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, + int dest_stride) { + // TODO(cd): move the creation of these buffers within the ASM file + // internal buffer used to transpose 8 lines into before transforming them + int16_t transpose_buffer[32 * 8]; + // results of the first pass (transpose and transform rows) + int16_t pass1[32 * 32]; + // results of the second pass (transpose and transform columns) + int16_t pass2[32 * 32]; + + // save register we need to preserve + save_neon_registers(); + // process rows + idct32_transpose_and_transform(transpose_buffer, pass1, input); + // process columns + // TODO(cd): do these two steps/passes within the ASM file + idct32_transpose_and_transform(transpose_buffer, pass2, pass1); + // combine and add to dest + // TODO(cd): integrate this within the last storage step of the second pass + idct32_combine_add(dest, pass2, dest_stride); + // restore register we need to preserve + restore_neon_registers(); +} + +// TODO(cd): Eliminate this file altogether when everything is in ASM file diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index edf5786..2e8001b 100644 --- a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -361,8 +361,6 @@ v_end vand d16, d20, d19 ; flat && mask vmov r5, r6, d16 - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) vabd.u8 d22, d3, d7 ; abs(p4 - p0) @@ -388,10 +386,11 @@ v_end vmov.u8 d22, #0x80 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + vand d17, 
d18, d16 ; flat2 && flat && mask vmov r5, r6, d17 - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch ; mbfilter() function @@ -405,15 +404,10 @@ v_end vmov.u8 d27, #3 vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 ; filter = clamp(filter + 3 * ( qs0 - ps0)) @@ -452,37 +446,37 @@ v_end vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 vqrshrn.u16 d18, q15, #3 ; r_op2 - vsubw.u8 q15, d4 ; op1 = op2 - p3 - vsubw.u8 q15, d5 ; op1 -= p2 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d9 ; op1 += q1 + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 vqrshrn.u16 d19, q15, #3 ; r_op1 - vsubw.u8 q15, d4 ; op0 = op1 - p3 - vsubw.u8 q15, d6 ; op0 -= p1 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d10 ; op0 += q2 + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 vqrshrn.u16 d20, q15, #3 ; r_op0 vsubw.u8 q15, d4 ; oq0 = op0 - p3 vsubw.u8 q15, d7 ; oq0 -= p0 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d11 ; oq0 += q3 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 vqrshrn.u16 d21, q15, #3 ; r_oq0 vsubw.u8 q15, d5 ; oq1 = oq0 - p2 vsubw.u8 q15, d8 ; oq1 -= q0 - vaddw.u8 q15, d9 ; oq1 += q1 - vaddw.u8 q15, d11 ; oq1 += q3 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 vqrshrn.u16 d22, q15, #3 ; r_oq1 vsubw.u8 q15, d6 ; oq2 = oq0 - p1 vsubw.u8 q15, d9 ; oq2 -= q1 - vaddw.u8 q15, d10 ; oq2 += q2 - vaddw.u8 q15, d11 ; oq2 += q3 + vadd.i16 q15, q14 vqrshrn.u16 d27, q15, #3 ; r_oq2 ; Filter does not set op2 or oq2, so use p2 and q2. 
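The rewrite above pipelines the flat-branch filter by pre-summing incoming sample pairs with vaddl.u8 while the previous vqrshrn is still in flight. The underlying arithmetic, taken from the asm's own comments, is a running window sum in which each output reuses the previous one via one subtract/add pair; a scalar sketch:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Flat (mbfilter) taps as a sliding window, following the comments above. */
static void mbfilter_ref(int p3, int p2, int p1, int p0,
                         int q0, int q1, int q2, int q3,
                         uint8_t *op2, uint8_t *op1, uint8_t *op0,
                         uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
  int sum = p3 * 3 + p2 * 2 + p1 + p0 + q0;   /* window for op2 */
  *op2 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p1 + q1 - p3 - p2;                   /* slide to op1 */
  *op1 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p0 + q2 - p3 - p1;                   /* slide to op0 */
  *op0 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q0 + q3 - p3 - p0;                   /* slide to oq0 */
  *oq0 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q1 + q3 - p2 - q0;                   /* slide to oq1 */
  *oq1 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q2 + q3 - p1 - q1;                   /* slide to oq2 */
  *oq2 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
}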
@@ -501,113 +495,104 @@ v_end ; wide_mbfilter flat2 && flat && mask branch vmov.u8 d16, #7 vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vmlal.u8 q15, d1, d29 ; op6 += p6 * 2 - vaddw.u8 q15, d2 ; op6 += p5 - vaddw.u8 q15, d3 ; op6 += p4 - vaddw.u8 q15, d4 ; op6 += p3 - vaddw.u8 q15, d5 ; op6 += p2 - vaddw.u8 q15, d6 ; op6 += p1 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 vqrshrn.u16 d16, q15, #4 ; w_op6 - vsubw.u8 q15, d0 ; op5 = op6 - p7 - vsubw.u8 q15, d1 ; op5 -= p6 - vaddw.u8 q15, d2 ; op5 += p5 - vaddw.u8 q15, d9 ; op5 += q1 + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 vqrshrn.u16 d24, q15, #4 ; w_op5 - vsubw.u8 q15, d0 ; op4 = op5 - p7 - vsubw.u8 q15, d2 ; op4 -= p5 - vaddw.u8 q15, d3 ; op4 += p4 - vaddw.u8 q15, d10 ; op4 += q2 + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 vqrshrn.u16 d25, q15, #4 ; w_op4 - vsubw.u8 q15, d0 ; op3 = op4 - p7 - vsubw.u8 q15, d3 ; op3 -= p4 - vaddw.u8 q15, d4 ; op3 += p3 - vaddw.u8 q15, d11 ; op3 += q3 + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 vqrshrn.u16 d26, q15, #4 ; w_op3 - vsubw.u8 q15, d0 ; op2 = op3 - p7 - vsubw.u8 q15, d4 ; op2 -= p3 - vaddw.u8 q15, d5 ; op2 += p2 + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) vqrshrn.u16 d27, q15, #4 ; w_op2 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - - vsubw.u8 q15, d0 ; op1 = op2 - p7 - vsubw.u8 q15, d5 ; op1 -= p2 + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 vaddw.u8 q15, d6 ; op1 += p1 vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) vqrshrn.u16 d18, q15, #4 ; w_op1 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - - vsubw.u8 q15, d0 ; op0 = op1 - p7 - vsubw.u8 q15, d6 ; op0 -= p1 + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 vaddw.u8 q15, d7 ; op0 += p0 vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) vqrshrn.u16 d19, q15, #4 ; w_op0 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - - vsubw.u8 q15, d0 ; oq0 = op0 - p7 - vsubw.u8 q15, d7 ; oq0 -= p0 + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 vaddw.u8 q15, d8 ; oq0 += q0 vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) vqrshrn.u16 d20, q15, #4 ; w_oq0 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - - vsubw.u8 q15, d1 ; oq1 = oq0 - p6 - vsubw.u8 q15, d8 ; oq1 -= q0 + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 vaddw.u8 q15, d9 ; oq1 += q1 + vaddl.u8 q4, d10, d15 vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) vqrshrn.u16 d21, q15, #4 ; w_oq1 + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - - vsubw.u8 q15, d2 ; oq2 = oq1 - p5 - vsubw.u8 q15, d9 ; oq2 -= q1 - vaddw.u8 q15, d10 ; oq2 += q2 - vaddw.u8 q15, d15 ; oq2 += q7 vqrshrn.u16 d22, q15, #4 ; w_oq2 + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - - vsubw.u8 q15, d3 ; oq3 = oq2 - p4 - vsubw.u8 q15, d10 ; oq3 -= q2 - vaddw.u8 q15, d11 ; oq3 += q3 - vaddw.u8 q15, d15 ; oq3 += q7 vqrshrn.u16 d23, q15, #4 ; w_oq3 + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 
q4, d13, d15 vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - - vsubw.u8 q15, d4 ; oq4 = oq3 - p3 - vsubw.u8 q15, d11 ; oq4 -= q3 - vaddw.u8 q15, d12 ; oq4 += q4 - vaddw.u8 q15, d15 ; oq4 += q7 vqrshrn.u16 d1, q15, #4 ; w_oq4 + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - - vsubw.u8 q15, d5 ; oq5 = oq4 - p2 - vsubw.u8 q15, d12 ; oq5 -= q4 - vaddw.u8 q15, d13 ; oq5 += q5 - vaddw.u8 q15, d15 ; oq5 += q7 vqrshrn.u16 d2, q15, #4 ; w_oq5 + vsub.i16 q15, q14 vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - - vsubw.u8 q15, d6 ; oq6 = oq5 - p1 - vsubw.u8 q15, d13 ; oq6 -= q5 - vaddw.u8 q15, d14 ; oq6 += q6 - vaddw.u8 q15, d15 ; oq6 += q7 - vqrshrn.u16 d3, q15, #4 ; w_oq6 - - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vadd.i16 q15, q4 vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm new file mode 100644 index 0000000..cf5c8f7 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm @@ -0,0 +1,198 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_short_idct16x16_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct16x16_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asr r0, r0, #6 ; >> 6 + + vdup.s16 q0, r0 ; duplicate a1 + mov r0, #8 + sub r2, #8 + + ; load destination data row0 - row3 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, 
q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row4 - row7 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row8 - row11 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row12 - row15 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_short_idct16x16_1_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm new file mode 100644 index 0000000..7464e80 --- /dev/null +++ 
b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -0,0 +1,1191 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_idct16x16_add_neon_pass1| + EXPORT |vp9_short_idct16x16_add_neon_pass2| + EXPORT |vp9_short_idct10_16x16_add_neon_pass1| + EXPORT |vp9_short_idct10_16x16_add_neon_pass2| + EXPORT |save_neon_registers| + EXPORT |restore_neon_registers| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Transpose an 8x8 16bit data matrix. Data is loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code ;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input ; r1 int16_t *output ; r2 int output_stride) + ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. |vp9_short_idct16x16_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]!
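The overlapping vld2.s16 chain above, completed by the vmov.s16 q15, q1 that follows, is an even-element gather: each vld2 de-interleaves sixteen values into even/odd halves, and reusing the odd-half register as the next load's even half leaves q8-q15 holding only elements 0, 2, ..., 14 of eight consecutive rows. Scalar equivalent:

#include <stdint.h>

/* Gather the even-indexed entries of eight 16-element rows, matching the
 * "load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15" comment. */
static void load_even_columns(const int16_t *input, int16_t rows[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      rows[r][c] = input[r * 16 + 2 * c];
}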
+ vmov.s16 q15, q1 + + ; generate cospi_28_64 = 3196 + mov r3, #0xc00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r12, #0x3e00 + add r12, #0xc5 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r12 ; duplicate cospi_4_64 + + ; preloading to avoid stall + ; generate cospi_12_64 = 13623 + mov r3, #0x3500 + add r3, #0x37 + + ; generate cospi_20_64 = 9102 + mov r12, #0x2300 + add r12, #0x8e + + ; step2[4] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; step2[4] * cospi_4_64 + vmull.s16 q5, d18, d1 + vmull.s16 q6, d19, d1 + + ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 + vmlal.s16 q5, d30, d0 + vmlal.s16 q6, d31, d0 + + vdup.16 d2, r3 ; duplicate cospi_12_64 + vdup.16 d3, r12 ; duplicate cospi_20_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q5, #14 ; >> 14 + vqrshrn.s32 d15, q6, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; generate cospi_24_64 = 6270 + mov r12, #0x1800 + add r12, #0x7e + + ; step2[5] * cospi_12_64 + vmull.s16 q2, d26, d2 + vmull.s16 q3, d27, d2 + + ; step2[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q15, d27, d3 + + ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q2, d22, d3 + vmlsl.s16 q3, d23, d3 + + ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q15, d23, d2 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q15, #14 ; >> 14 + + ; stage 4 + vdup.16 d30, r3 ; cospi_16_64 + + ; step1[0] * cospi_16_64 + vmull.s16 q2, d16, d30 + vmull.s16 q11, d17, d30 + + ; step1[1] * cospi_16_64 + vmull.s16 q0, d24, d30 + vmull.s16 q1, d25, d30 + + ; generate cospi_8_64 = 15137 + mov r3, #0x3b00 + add r3, #0x21 + + vdup.16 d30, r12 ; duplicate cospi_24_64 + vdup.16 d31, r3 ; duplicate cospi_8_64 + + ; temp1 = (step1[0] + step1[1]) * cospi_16_64 + vadd.s32 q3, q2, q0 + vadd.s32 q12, q11, q1 + + ; temp2 = (step1[0] - step1[1]) * cospi_16_64 + vsub.s32 q13, q2, q0 + vsub.s32 q1, q11, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d16, q3, #14 ; >> 14 + vqrshrn.s32 d17, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d18, q13, #14 ; >> 14 + vqrshrn.s32 d19, q1, #14 ; >> 14 + + ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + ; step1[2] * cospi_8_64 + vmull.s16 q0, d20, d31 + vmull.s16 q1, d21, d31 + + ; step1[2] * cospi_24_64 + vmull.s16 q12, d20, d30 + vmull.s16 q13, d21, d30 + + ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q0, d28, d30 + vmlal.s16 q1, d29, d30 + + ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q12, d28, d31 + vmlsl.s16 q13, d29, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d22, q0, #14 ; >> 14 + vqrshrn.s32 d23, q1, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d20, q12, #14 ; >> 14 + vqrshrn.s32 d21, q13, #14 ; >> 14 + + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; + vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; + + ; generate cospi_16_64 
= 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; stage 5 + vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; + vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; + vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; + vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; + + vdup.16 d16, r3; ; duplicate cospi_16_64 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q6, q9, q11 + vsub.s32 q13, q10, q12 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q6, #14 ; >> 14 + vqrshrn.s32 d11, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d16}, [r1], r2 + vst1.64 {d17}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |vp9_short_idct16x16_add_neon_pass1| + +;void vp9_short_idct16x16_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_short_idct16x16_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! 
+ vmov.s16 q15, q0; + + ; generate cospi_30_64 = 1606 + mov r3, #0x0600 + add r3, #0x46 + + ; generate cospi_2_64 = 16305 + mov r12, #0x3f00 + add r12, #0xb1 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d12, r3 ; duplicate cospi_30_64 + vdup.16 d13, r12 ; duplicate cospi_2_64 + + ; preloading to avoid stall + ; generate cospi_14_64 = 12665 + mov r3, #0x3100 + add r3, #0x79 + + ; generate cospi_18_64 = 10394 + mov r12, #0x2800 + add r12, #0x9a + + ; step1[8] * cospi_30_64 + vmull.s16 q2, d16, d12 + vmull.s16 q3, d17, d12 + + ; step1[8] * cospi_2_64 + vmull.s16 q1, d16, d13 + vmull.s16 q4, d17, d13 + + ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 + vmlsl.s16 q2, d30, d13 + vmlsl.s16 q3, d31, d13 + + ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 + vmlal.s16 q1, d30, d12 + vmlal.s16 q4, d31, d12 + + vdup.16 d30, r3 ; duplicate cospi_14_64 + vdup.16 d31, r12 ; duplicate cospi_18_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d0, q2, #14 ; >> 14 + vqrshrn.s32 d1, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q1, #14 ; >> 14 + vqrshrn.s32 d15, q4, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_22_64 = 7723 + mov r3, #0x1e00 + add r3, #0x2b + + ; generate cospi_10_64 = 14449 + mov r12, #0x3800 + add r12, #0x71 + + ; step1[9] * cospi_14_64 + vmull.s16 q2, d24, d30 + vmull.s16 q3, d25, d30 + + ; step1[9] * cospi_18_64 + vmull.s16 q4, d24, d31 + vmull.s16 q5, d25, d31 + + ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 + vmlsl.s16 q2, d22, d31 + vmlsl.s16 q3, d23, d31 + + ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 + vmlal.s16 q4, d22, d30 + vmlal.s16 q5, d23, d30 + + vdup.16 d30, r3 ; duplicate cospi_22_64 + vdup.16 d31, r12 ; duplicate cospi_10_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q2, #14 ; >> 14 + vqrshrn.s32 d3, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q4, #14 ; >> 14 + vqrshrn.s32 d13, q5, #14 ; >> 14 + + ; step1[10] * cospi_22_64 + vmull.s16 q11, d20, d30 + vmull.s16 q12, d21, d30 + + ; step1[10] * cospi_10_64 + vmull.s16 q4, d20, d31 + vmull.s16 q5, d21, d31 + + ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 + vmlsl.s16 q11, d26, d31 + vmlsl.s16 q12, d27, d31 + + ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 + vmlal.s16 q4, d26, d30 + vmlal.s16 q5, d27, d30 + + ; preloading to avoid stall + ; generate cospi_6_64 = 15679 + mov r3, #0x3d00 + add r3, #0x3f + + ; generate cospi_26_64 = 4756 + mov r12, #0x1200 + add r12, #0x94 + + vdup.16 d30, r3 ; duplicate cospi_6_64 + vdup.16 d31, r12 ; duplicate cospi_26_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d11, q5, #14 ; >> 14 + vqrshrn.s32 d10, q4, #14 ; >> 14 + + ; step1[11] * cospi_6_64 + vmull.s16 q10, d28, d30 + vmull.s16 q11, d29, d30 + + ; step1[11] * cospi_26_64 + vmull.s16 q12, d28, d31 + vmull.s16 q13, d29, d31 + + ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 + vmlsl.s16 q10, d18, d31 + vmlsl.s16 q11, d19, d31 + + ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 + vmlal.s16 q12, d18, d30 + vmlal.s16 q13, d19, d30 + + vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] + vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q11, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q12, #14 ; >> 14 + vqrshrn.s32 d9, q13, #14 ; 
>> 14 + + ; stage 3 + vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] + vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] + vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] + vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] + vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] + vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + + ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vdup.16 d30, r12 ; duplicate cospi_8_64 + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d18, d31 + vmull.s16 q3, d19, d31 + + ; step1[14] * cospi_24_64 + vmull.s16 q4, d28, d31 + vmull.s16 q5, d29, d31 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d28, d30 + vmlal.s16 q3, d29, d30 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q4, d18, d30 + vmlsl.s16 q5, d19, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q4, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + vmov.s16 q3, q11 + vmov.s16 q4, q12 + + ; - step1[13] * cospi_8_64 + vmull.s16 q11, d26, d30 + vmull.s16 q12, d27, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d20, d30 + vmull.s16 q9, d21, d30 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlsl.s16 q11, d20, d31 + vmlsl.s16 q12, d21, d31 + + ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d26, d31 + vmlal.s16 q9, d27, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. 
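All of the stage arithmetic in these passes follows one pattern: widen with vmull/vmlal/vmlsl, then narrow back with a rounding shift by DCT_CONST_BITS. A scalar sketch of that pattern, with constants and rounding taken from the asm itself (the vqrshrn narrowing also saturates, which the sketch omits):

#include <stdint.h>

/* DCT_CONST_BITS == 14; matches the "+ 0x2000, asr #14" sequence and the
 * vqrshrn.s32 #14 narrowings used throughout. */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)((input + (1 << 13)) >> 14);
}

/* One rotation butterfly of the kind each vmull/vmlal/vqrshrn group
 * implements, e.g. stage 4 of the odd half with a = step1[14],
 * b = step1[9] and (c1, c2) = (cospi_24_64, cospi_8_64) = (6270, 15137): */
static void butterfly(int16_t a, int16_t b, int c1, int c2,
                      int16_t *out0, int16_t *out1) {
  *out0 = dct_const_round_shift(a * c1 - b * c2);
  *out1 = dct_const_round_shift(a * c2 + b * c1);
}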
+ ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q10, q3, q0 + vadd.s32 q4, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q10, #14 ; >> 14 + vqrshrn.s32 d11, q4, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + cmp r3, #0 ; check if need adding dest data + beq skip_adding_dest + + ldr r7, [sp, #28] ; dest used to save element 0-7 + mov r9, r7 ; save dest pointer for later use + ldr r8, [sp, #32] ; load dest_stride + + ; stage 7 + ; load the data in pass1 + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; +
dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + + ; store the data output 8,9,10,11,12,13,14,15 + vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q8 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q9, q9, #6 + vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q9 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q2, q2, #6 + vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q2 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q3, q3, #6 + vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q3 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q4, q4, #6 + vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q4 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q5, q5, #6 + vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q5 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q14, q14, #6 + vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q14 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q15, q15, #6 + vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q15 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + b end_idct16x16_pass2 + +skip_adding_dest + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3
; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |vp9_short_idct16x16_add_neon_pass2| + +;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_short_idct10_16x16_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64*2 = 6392 + mov r3, #0x1800 + add r3, #0xf8 + + ; generate cospi_4_64*2 = 32138 + mov r12, #0x7d00 + add r12, #0x8a + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q0, r3 ; duplicate cospi_28_64*2 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply, + ; double, and return the high 16 bits, effectively giving >> 15. Doubling + ; the constant will change this to >> 14.
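Spelled out in scalar form (the saturation only triggers for the 0x8000 * 0x8000 case):

#include <stdint.h>

/* Scalar model of vqrdmulh.s16: saturating (2*a*b + (1 << 15)) >> 16.
 * Feeding it a pre-doubled constant 2*c therefore yields
 * (a*c + (1 << 13)) >> 14, i.e. dct_const_round_shift(a * c). */
static int16_t vqrdmulh_s16(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * b + (1 << 15);
  p >>= 16;
  return (int16_t)(p > INT16_MAX ? INT16_MAX : p);
}

/* e.g. vqrdmulh_s16(x, 6392) == dct_const_round_shift(x * 3196), with
 * 3196 == cospi_28_64 and 6392 == cospi_28_64*2 as generated above. */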
+ ; dct_const_round_shift(step2[4] * cospi_28_64); + vqrdmulh.s16 q4, q9, q0 + + ; preloading to avoid stall + ; generate cospi_16_64*2 = 23170 + mov r3, #0x5a00 + add r3, #0x82 + + ; dct_const_round_shift(step2[4] * cospi_4_64); + vqrdmulh.s16 q7, q9, q1 + + ; stage 4 + vdup.16 q1, r3 ; cospi_16_64*2 + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + vdup.16 d4, r3; ; duplicate cospi_16_64 + + ; dct_const_round_shift(step1[0] * cospi_16_64) + vqrdmulh.s16 q8, q8, q1 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d14, d4 + vmull.s16 q10, d15, d4 + + ; step2[5] * cospi_16_64 + vmull.s16 q12, d9, d4 + vmull.s16 q11, d8, d4 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q15, q10, q12 + vsub.s32 q6, q9, q11 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d11, q15, #14 ; >> 14 + vqrshrn.s32 d10, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; + vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; + vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d4}, [r1], r2 + vst1.64 {d5}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + +;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_short_idct10_16x16_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! 
+ vmov.s16 q15, q0; + + ; generate 2*cospi_30_64 = 3212 + mov r3, #0xc00 + add r3, #0x8c + + ; generate 2*cospi_2_64 = 32610 + mov r12, #0x7f00 + add r12, #0x62 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q6, r3 ; duplicate 2*cospi_30_64 + + ; dct_const_round_shift(step1[8] * cospi_30_64) + vqrdmulh.s16 q0, q8, q6 + + vdup.16 q6, r12 ; duplicate 2*cospi_2_64 + + ; dct_const_round_shift(step1[8] * cospi_2_64) + vqrdmulh.s16 q7, q8, q6 + + ; preloading to avoid stall + ; generate 2*cospi_26_64 = 9512 + mov r12, #0x2500 + add r12, #0x28 + rsb r12, #0 + vdup.16 q15, r12 ; duplicate -2*cospi_26_64 + + ; generate 2*cospi_6_64 = 31358 + mov r3, #0x7a00 + add r3, #0x7e + vdup.16 q14, r3 ; duplicate 2*cospi_6_64 + + ; dct_const_round_shift(- step1[12] * cospi_26_64) + vqrdmulh.s16 q3, q9, q15 + + ; dct_const_round_shift(step1[12] * cospi_6_64) + vqrdmulh.s16 q4, q9, q14 + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + vdup.16 d30, r12 ; duplicate cospi_8_64 + + ; step1[14] * cospi_24_64 + vmull.s16 q12, d14, d31 + vmull.s16 q5, d15, d31 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d0, d31 + vmull.s16 q11, d1, d31 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q12, d0, d30 + vmlsl.s16 q5, d1, d30 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d14, d30 + vmlal.s16 q11, d15, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q12, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q11, #14 ; >> 14 + + ; - step1[13] * cospi_8_64 + vmull.s16 q10, d8, d30 + vmull.s16 q13, d9, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d6, d30 + vmull.s16 q9, d7, d30 + + ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 + vmlsl.s16 q10, d6, d31 + vmlsl.s16 q13, d7, d31 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d8, d31 + vmlal.s16 q9, d9, d31 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q10, #14 ; >> 14 + vqrshrn.s32 d5, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. 
+ ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q0, q3, q0 + vadd.s32 q1, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q0, #14 ; >> 14 + vqrshrn.s32 d11, q1, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 
{d31}, [r1], r5 +end_idct10_16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| +;void |save_neon_registers|() +|save_neon_registers| PROC + vpush {d8-d15} + bx lr + ENDP ; |save_registers| +;void |restore_neon_registers|() +|restore_neon_registers| PROC + vpop {d8-d15} + bx lr + ENDP ; |restore_registers| + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm new file mode 100644 index 0000000..5c097cc --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -0,0 +1,1013 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +;TODO(cd): adjust these constants to be able to use vqdmulh for faster +; dct_const_round_shift(a * b) within butterfly calculations. +cospi_1_64 EQU 16364 +cospi_2_64 EQU 16305 +cospi_3_64 EQU 16207 +cospi_4_64 EQU 16069 +cospi_5_64 EQU 15893 +cospi_6_64 EQU 15679 +cospi_7_64 EQU 15426 +cospi_8_64 EQU 15137 +cospi_9_64 EQU 14811 +cospi_10_64 EQU 14449 +cospi_11_64 EQU 14053 +cospi_12_64 EQU 13623 +cospi_13_64 EQU 13160 +cospi_14_64 EQU 12665 +cospi_15_64 EQU 12140 +cospi_16_64 EQU 11585 +cospi_17_64 EQU 11003 +cospi_18_64 EQU 10394 +cospi_19_64 EQU 9760 +cospi_20_64 EQU 9102 +cospi_21_64 EQU 8423 +cospi_22_64 EQU 7723 +cospi_23_64 EQU 7005 +cospi_24_64 EQU 6270 +cospi_25_64 EQU 5520 +cospi_26_64 EQU 4756 +cospi_27_64 EQU 3981 +cospi_28_64 EQU 3196 +cospi_29_64 EQU 2404 +cospi_30_64 EQU 1606 +cospi_31_64 EQU 804 + + + EXPORT |idct32_transpose_and_transform| + EXPORT |idct32_combine_add| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY + + ; -------------------------------------------------------------------------- + ; Load from transposed_buffer + ; q13 = transposed_buffer[first_offset] + ; q14 = transposed_buffer[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; transposed_buffer must be passed in. use 0 for first use. + MACRO + LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset + ; address calculation with proper stride and loading + add r0, #($first_offset - $prev_offset )*8*2 + vld1.s16 {q14}, [r0] + add r0, #($second_offset - $first_offset)*8*2 + vld1.s16 {q13}, [r0] + ; (used) two registers (q14, q13) + MEND + ; -------------------------------------------------------------------------- + ; Load from output (used as temporary storage) + ; reg1 = output[first_offset] + ; reg2 = output[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use.
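The cospi_k_64 EQU table at the top of this file is round(16384 * cos(k*pi/64)), i.e. the DCT cosines in 14-bit fixed point; a small generator that reproduces it:

#include <math.h>
#include <stdio.h>

#define PI 3.14159265358979323846

int main(void) {
  int k;
  for (k = 1; k < 32; ++k)
    printf("cospi_%d_64 EQU %d\n", k,
           (int)lround(16384.0 * cos(k * PI / 64)));
  return 0;
}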
+ MACRO + LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and loading + add r1, #($first_offset - $prev_offset )*32*2 + vld1.s16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vld1.s16 {$reg2}, [r1] + ; (used) two registers ($reg1, $reg2) + MEND + ; -------------------------------------------------------------------------- + ; Store into output (sometimes used as temporary storage) + ; output[first_offset] = reg1 + ; output[second_offset] = reg2 + ; for proper address calculation, the last offset used when manipulating + ; output (whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and storing + add r1, #($first_offset - $prev_offset )*32*2 + vst1.16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vst1.16 {$reg2}, [r1] + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + ; TODO(cd): have special case to re-use constants when they are similar for + ; consecutive butterflies + ; TODO(cd): have special case when both constants are the same, do the + ; additions/subtractions before the multiplies. + ; generate the constants + ; generate scalar constants + mov r3, #$first_constant & 0xFF00 + add r3, #$first_constant & 0x00FF + mov r12, #$second_constant & 0xFF00 + add r12, #$second_constant & 0x00FF + ; generate vector constants + vdup.16 d30, r3 + vdup.16 d31, r12 + ; (used) two for inputs (regA-regD), one for constants (q15) + ; do some multiplications (ordered for maximum latency hiding) + vmull.s16 q8, $regC, d30 + vmull.s16 q10, $regA, d31 + vmull.s16 q9, $regD, d30 + vmull.s16 q11, $regB, d31 + vmull.s16 q12, $regC, d31 + ; (used) five for intermediate (q8-q12), one for constants (q15) + ; do some additions/subtractions (to get back two registers) + vsub.s32 q8, q8, q10 + vsub.s32 q9, q9, q11 + ; do more multiplications (ordered for maximum latency hiding) + vmull.s16 q10, $regD, d31 + vmull.s16 q11, $regA, d30 + vmull.s16 q15, $regB, d30 + ; (used) six for intermediate (q8-q12, q15) + ; do more additions/subtractions + vadd.s32 q11, q12, q11 + vadd.s32 q10, q10, q15 + ; (used) four for intermediate (q8-q11) + ; dct_const_round_shift + vqrshrn.s32 $reg1, q8, #14 + vqrshrn.s32 $reg2, q9, #14 + vqrshrn.s32 $reg3, q11, #14 + vqrshrn.s32 $reg4, q10, #14 + ; (used) two q registers for results, i.e. four d registers + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + MEND + ; -------------------------------------------------------------------------- + +;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input); +; +; r0 int16_t *transpose_buffer +; r1 int16_t *output +; r2 int16_t *input) +; TODO(cd): have more logical parameter ordering but this issue will disappear +; when functions are combined.
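DO_BUTTERFLY is the workhorse of everything that follows. Per 16-bit lane it computes the standard rotate-and-round pair; a scalar sketch under the DO_BUTTERFLY_STD register convention (input a from q14, input b from q13; helper names are ours, and the asm's vqrshrn additionally saturates the 32-to-16-bit narrowing, which this sketch omits):

    #include <stdint.h>

    /* dct_const_round_shift with DCT_CONST_BITS == 14 */
    static int16_t dct_const_round_shift(int32_t x) {
      return (int16_t)((x + (1 << 13)) >> 14);
    }

    /* One butterfly: out1 = round(a*k1 - b*k2), out2 = round(a*k2 + b*k1). */
    static void do_butterfly(int16_t a, int16_t b, int16_t k1, int16_t k2,
                             int16_t *out1, int16_t *out2) {
      *out1 = dct_const_round_shift(a * k1 - b * k2);
      *out2 = dct_const_round_shift(a * k2 + b * k1);
    }

The five vmull/two vsub then three vmull/two vadd split in the macro is just this pair of dot products, scheduled to hide multiplier latency.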
+ +|idct32_transpose_and_transform| PROC + ; This function does one pass of the idct32x32 transform. + ; + ; This is done by transposing the input and then doing a 1d transform on + ; columns. In the first pass, the transposed columns are the original + ; rows. In the second pass, after the transposition, the columns are the + ; original columns. + ; The 1d transform is done by looping over bands of eight columns (the + ; idct32_bands loop). For each band, the transform input transposition + ; is done on demand, one band of four 8x8 matrices at a time. The four + ; matrices are transposed in pairs (the idct32_transpose_pair loop). + push {r4} + mov r4, #0 ; initialize bands loop counter +idct32_bands_loop + ; TODO(cd) get rid of these push/pop by properly adjusting register + ; content at end of loop + push {r0} + push {r1} + push {r2} + mov r3, #0 ; initialize transpose loop counter +idct32_transpose_pair_loop + ; Load two horizontally consecutive 8x8 16bit data matrices. The first one + ; into q0-q7 and the second one into q8-q15. There is a stride of 64, + ; adjusted to 32 because of the two post-increments. + vld1.s16 {q8}, [r2]! + vld1.s16 {q0}, [r2]! + add r2, #32 + vld1.s16 {q9}, [r2]! + vld1.s16 {q1}, [r2]! + add r2, #32 + vld1.s16 {q10}, [r2]! + vld1.s16 {q2}, [r2]! + add r2, #32 + vld1.s16 {q11}, [r2]! + vld1.s16 {q3}, [r2]! + add r2, #32 + vld1.s16 {q12}, [r2]! + vld1.s16 {q4}, [r2]! + add r2, #32 + vld1.s16 {q13}, [r2]! + vld1.s16 {q5}, [r2]! + add r2, #32 + vld1.s16 {q14}, [r2]! + vld1.s16 {q6}, [r2]! + add r2, #32 + vld1.s16 {q15}, [r2]! + vld1.s16 {q7}, [r2]! + + ; Transpose the two 8x8 16bit data matrices. + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vswp d1, d8 + vswp d7, d14 + vswp d5, d12 + vswp d3, d10 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; Store both matrices after each other. There is a stride of 32, which + ; adjusts to nothing because of the post-increments. + vst1.16 {q8}, [r0]! + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + vst1.16 {q0}, [r0]! + vst1.16 {q1}, [r0]! + vst1.16 {q2}, [r0]! + vst1.16 {q3}, [r0]! + vst1.16 {q4}, [r0]! + vst1.16 {q5}, [r0]! + vst1.16 {q6}, [r0]! + vst1.16 {q7}, [r0]! + + ; increment pointers by adjusted stride (not necessary for r0/transpose_buffer) + sub r2, r2, #8*32*2-32-16*2 + ; transpose pair loop processing + add r3, r3, #1 + cmp r3, #1 + BLE idct32_transpose_pair_loop + + ; restore r0/transpose_buffer to its original value + sub r0, r0, #32*8*2 + + ; Instead of doing the transforms stage by stage, it is done by loading + ; some input values and doing as many stages as possible to minimize the + ; storing/loading of intermediate results. To fit within registers, the + ; final coefficients are cut into four blocks: + ; BLOCK A: 16-19,28-31 + ; BLOCK B: 20-23,24-27 + ; BLOCK C: 8-10,11-15 + ; BLOCK D: 0-3,4-7 + ; Blocks A and C are straight calculation through the various stages. In + ; block B, further calculations are performed using the results from + ; block A. In block D, further calculations are performed using the results + ; from block C and then the final calculations are done using results from + ; blocks A and B, which were combined at the end of block B.
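Before diving into the blocks, it helps to see how the two exports are meant to compose into a full 32x32 inverse transform. A hedged sketch of the presumed C-side driver (the wrapper itself and the buffer names are assumptions, not part of this file; only the two prototypes come from the comments here):

    #include <stdint.h>

    extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
                                               int16_t *output, int16_t *input);
    extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);

    /* Two 1-D passes (each transposes on demand, one band of eight columns
       at a time), then the rounding add into the destination. */
    static void idct32x32_add_sketch(int16_t *input, uint8_t *dest, int stride) {
      int16_t transpose_buffer[32 * 8];   /* scratch for one band */
      int16_t pass1[32 * 32], pass2[32 * 32];
      idct32_transpose_and_transform(transpose_buffer, pass1, input);
      idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
      idct32_combine_add(dest, pass2, stride);
    }

The save_neon_registers/restore_neon_registers helpers defined earlier presumably bracket such calls on the C side, since the asm clobbers the callee-saved d8-d15.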
+ + ; -------------------------------------------------------------------------- + ; BLOCK A: 16-19,28-31 + ; -------------------------------------------------------------------------- + ; generate 16,17,30,31 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; + ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; + ;step1b[16][i] = dct_const_round_shift(temp1); + ;step1b[31][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 0, 1, 31 + DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; + ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; + ;step1b[17][i] = dct_const_round_shift(temp1); + ;step1b[30][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 31, 17, 15 + DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[16] = step1b[16][i] + step1b[17][i]; + ;step2[17] = step1b[16][i] - step1b[17][i]; + ;step2[30] = -step1b[30][i] + step1b[31][i]; + ;step2[31] = step1b[30][i] + step1b[31][i]; + vadd.s16 q4, q0, q1 + vsub.s16 q13, q0, q1 + vadd.s16 q6, q2, q3 + vsub.s16 q14, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; + ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; + ;step3[17] = dct_const_round_shift(temp1); + ;step3[30] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; generate 18,19,28,29 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; + ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; + ;step1b[18][i] = dct_const_round_shift(temp1); + ;step1b[29][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 15, 9, 23 + DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; + ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; + ;step1b[19][i] = dct_const_round_shift(temp1); + ;step1b[28][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 23, 25, 7 + DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[18] = -step1b[18][i] + step1b[19][i]; + ;step2[19] = step1b[18][i] + step1b[19][i]; + ;step2[28] = step1b[28][i] + step1b[29][i]; + ;step2[29] = step1b[28][i] - step1b[29][i]; + vsub.s16 q13, q3, q2 + vadd.s16 q3, q3, q2 + vsub.s16 q14, q1, q0 + vadd.s16 q2, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); + ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); + ;step3[29] = dct_const_round_shift(temp1); + ;step3[18] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_4_64), 
(-cospi_28_64), d2, d3, d0, d1 + ; -------------------------------------------------------------------------- + ; combine 16-19,28-31 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[16] = step1b[16][i] + step1b[19][i]; + ;step1[17] = step1b[17][i] + step1b[18][i]; + ;step1[18] = step1b[17][i] - step1b[18][i]; + ;step1[29] = step1b[30][i] - step1b[29][i]; + ;step1[30] = step1b[30][i] + step1b[29][i]; + ;step1[31] = step1b[31][i] + step1b[28][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q0 + vadd.s16 q10, q7, q1 + vadd.s16 q15, q6, q3 + vsub.s16 q13, q5, q0 + vsub.s16 q14, q7, q1 + STORE_IN_OUTPUT 0, 16, 31, q8, q15 + STORE_IN_OUTPUT 31, 17, 30, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; + ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; + ;step2[18] = dct_const_round_shift(temp1); + ;step2[29] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 + STORE_IN_OUTPUT 30, 29, 18, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[19] = step1b[16][i] - step1b[19][i]; + ;step1[28] = step1b[31][i] - step1b[28][i]; + vsub.s16 q13, q4, q2 + vsub.s16 q14, q6, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; + ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; + ;step2[19] = dct_const_round_shift(temp1); + ;step2[28] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 + STORE_IN_OUTPUT 18, 19, 28, q4, q6 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK B: 20-23,24-27 + ; -------------------------------------------------------------------------- + ; generate 20,21,26,27 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; + ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; + ;step1b[20][i] = dct_const_round_shift(temp1); + ;step1b[27][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 7, 5, 27 + DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; + ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; + ;step1b[21][i] = dct_const_round_shift(temp1); + ;step1b[26][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 27, 21, 11 + DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[20] = step1b[20][i] + step1b[21][i]; + ;step2[21] = step1b[20][i] - step1b[21][i]; + ;step2[26] = -step1b[26][i] + step1b[27][i]; + ;step2[27] = step1b[26][i] + step1b[27][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; + ;temp2 = step1b[26][i] * cospi_20_64 + 
step1b[21][i] * cospi_12_64; + ;step3[21] = dct_const_round_shift(temp1); + ;step3[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 22,23,24,25 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; + ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; + ;step1b[22][i] = dct_const_round_shift(temp1); + ;step1b[25][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 11, 13, 19 + DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; + ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; + ;step1b[23][i] = dct_const_round_shift(temp1); + ;step1b[24][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 19, 29, 3 + DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[22] = -step1b[22][i] + step1b[23][i]; + ;step2[23] = step1b[22][i] + step1b[23][i]; + ;step2[24] = step1b[24][i] + step1b[25][i]; + ;step2[25] = step1b[24][i] - step1b[25][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); + ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); + ;step3[25] = dct_const_round_shift(temp1); + ;step3[22] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 20-23,24-27 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[22] = step1b[22][i] + step1b[21][i]; + ;step1[23] = step1b[23][i] + step1b[20][i]; + vadd.s16 q10, q7, q1 + vadd.s16 q11, q5, q0 + ;step1[24] = step1b[24][i] + step1b[27][i]; + ;step1[25] = step1b[25][i] + step1b[26][i]; + vadd.s16 q12, q6, q2 + vadd.s16 q15, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[16] = step1b[16][i] + step1b[23][i]; + ;step3[17] = step1b[17][i] + step1b[22][i]; + ;step3[22] = step1b[17][i] - step1b[22][i]; + ;step3[23] = step1b[16][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 + vadd.s16 q8, q14, q11 + vadd.s16 q9, q13, q10 + vsub.s16 q13, q13, q10 + vsub.s16 q11, q14, q11 + STORE_IN_OUTPUT 17, 17, 16, q9, q8 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[24] = step1b[31][i] - step1b[24][i]; + ;step3[25] = step1b[30][i] - step1b[25][i]; + ;step3[30] = step1b[30][i] + step1b[25][i]; + ;step3[31] = step1b[31][i] + step1b[24][i]; + LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 + vsub.s16 q8, q9, q12 + vadd.s16 q10, q14, q15 + vsub.s16 q14, q14, q15 + vadd.s16 q12, q9, q12 + STORE_IN_OUTPUT 31, 30, 31, q10, q12 + ; -------------------------------------------------------------------------- + ; TODO(cd) do some register allocation change to remove these push/pop + vpush {q8} ; [24] + vpush {q11} ; [23] + ; 
-------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; + ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; + ;step1[22] = dct_const_round_shift(temp1); + ;step1[25] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 31, 25, 22, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; + ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; + ;step1[23] = dct_const_round_shift(temp1); + ;step1[24] = dct_const_round_shift(temp2); + ; TODO(cd) do some register allocation change to remove these push/pop + vpop {q13} ; [23] + vpop {q14} ; [24] + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 22, 24, 23, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[20] = step1b[23][i] - step1b[20][i]; + ;step1[27] = step1b[24][i] - step1b[27][i]; + vsub.s16 q14, q5, q0 + vsub.s16 q13, q6, q2 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); + ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); + ;step2[27] = dct_const_round_shift(temp1); + ;step2[20] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[21] = step1b[22][i] - step1b[21][i]; + ;step1[26] = step1b[25][i] - step1b[26][i]; + vsub.s16 q14, q7, q1 + vsub.s16 q13, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); + ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); + ;step2[26] = dct_const_round_shift(temp1); + ;step2[21] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[18] = step1b[18][i] + step1b[21][i]; + ;step3[19] = step1b[19][i] + step1b[20][i]; + ;step3[20] = step1b[19][i] - step1b[20][i]; + ;step3[21] = step1b[18][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 + vadd.s16 q8, q14, q1 + vadd.s16 q9, q13, q6 + vsub.s16 q13, q13, q6 + vsub.s16 q1, q14, q1 + STORE_IN_OUTPUT 19, 18, 19, q8, q9 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[27] = step1b[28][i] - step1b[27][i]; + ;step3[28] = step1b[28][i] + step1b[27][i]; + ;step3[29] = step1b[29][i] + step1b[26][i]; + ;step3[26] = step1b[29][i] - step1b[26][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 + vsub.s16 q14, q8, q5 + vadd.s16 q10, q8, q5 + vadd.s16 q11, q9, q0 + vsub.s16 q0, q9, q0 + STORE_IN_OUTPUT 29, 28, 29, q10, q11 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; + ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; + ;step1[20] = dct_const_round_shift(temp1); + ;step1[27] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 29, 20, 27, q13, q14 + ; 
-------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; + ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; + ;step1[21] = dct_const_round_shift(temp1); + ;step1[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 + STORE_IN_OUTPUT 27, 21, 26, q1, q0 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK C: 8-10,11-15 + ; -------------------------------------------------------------------------- + ; generate 8,9,14,15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; + ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; + ;step2[8] = dct_const_round_shift(temp1); + ;step2[15] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 3, 2, 30 + DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; + ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; + ;step2[9] = dct_const_round_shift(temp1); + ;step2[14] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 30, 18, 14 + DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[8] = step1b[8][i] + step1b[9][i]; + ;step3[9] = step1b[8][i] - step1b[9][i]; + ;step3[14] = step1b[15][i] - step1b[14][i]; + ;step3[15] = step1b[15][i] + step1b[14][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; + ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; + ;step1[9] = dct_const_round_shift(temp1); + ;step1[14] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 10,11,12,13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; + ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; + ;step2[10] = dct_const_round_shift(temp1); + ;step2[13] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 14, 10, 22 + DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; + ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; + ;step2[11] = dct_const_round_shift(temp1); + ;step2[12] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 22, 26, 6 + DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[10] = step1b[11][i] - step1b[10][i]; + ;step3[11] = step1b[11][i] + step1b[10][i]; + ;step3[12] = step1b[12][i] + step1b[13][i]; + ;step3[13] = step1b[12][i] - step1b[13][i]; + vsub.s16 q14, q4, 
q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); + ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); + ;step1[13] = dct_const_round_shift(temp1); + ;step1[10] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 8-10,11-15 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[8] = step1b[8][i] + step1b[11][i]; + ;step2[9] = step1b[9][i] + step1b[10][i]; + ;step2[10] = step1b[9][i] - step1b[10][i]; + vadd.s16 q8, q0, q5 + vadd.s16 q9, q1, q7 + vsub.s16 q13, q1, q7 + ;step2[13] = step1b[14][i] - step1b[13][i]; + ;step2[14] = step1b[14][i] + step1b[13][i]; + ;step2[15] = step1b[15][i] + step1b[12][i]; + vsub.s16 q14, q3, q4 + vadd.s16 q10, q3, q4 + vadd.s16 q15, q2, q6 + STORE_IN_OUTPUT 26, 8, 15, q8, q15 + STORE_IN_OUTPUT 15, 9, 14, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; + ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; + ;step3[10] = dct_const_round_shift(temp1); + ;step3[13] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 14, 13, 10, q3, q1 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[11] = step1b[8][i] - step1b[11][i]; + ;step2[12] = step1b[15][i] - step1b[12][i]; + vsub.s16 q13, q0, q5 + vsub.s16 q14, q2, q6 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; + ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; + ;step3[11] = dct_const_round_shift(temp1); + ;step3[12] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 10, 11, 12, q1, q3 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK D: 0-3,4-7 + ; -------------------------------------------------------------------------- + ; generate 4,5,6,7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; + ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; + ;step3[4] = dct_const_round_shift(temp1); + ;step3[7] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 6, 4, 28 + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; + ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; + ;step3[5] = dct_const_round_shift(temp1); + ;step3[6] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 28, 20, 12 + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[4] = step1b[4][i] + step1b[5][i]; + ;step1[5] = step1b[4][i] - step1b[5][i]; + ;step1[6] = step1b[7][i] - step1b[6][i]; + 
;step1[7] = step1b[7][i] + step1b[6][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; + ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; + ;step2[5] = dct_const_round_shift(temp1); + ;step2[6] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 0,1,2,3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; + ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; + ;step1[1] = dct_const_round_shift(temp1); + ;step1[0] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 12, 0, 16 + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; + ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; + ;step1[2] = dct_const_round_shift(temp1); + ;step1[3] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 16, 8, 24 + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[0] = step1b[0][i] + step1b[3][i]; + ;step2[1] = step1b[1][i] + step1b[2][i]; + ;step2[2] = step1b[1][i] - step1b[2][i]; + ;step2[3] = step1b[0][i] - step1b[3][i]; + vadd.s16 q4, q7, q6 + vsub.s16 q7, q7, q6 + vsub.s16 q6, q5, q14 + vadd.s16 q5, q5, q14 + ; -------------------------------------------------------------------------- + ; combine 0-3,4-7 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[0] = step1b[0][i] + step1b[7][i]; + ;step3[1] = step1b[1][i] + step1b[6][i]; + ;step3[2] = step1b[2][i] + step1b[5][i]; + ;step3[3] = step1b[3][i] + step1b[4][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q3 + vadd.s16 q10, q6, q1 + vadd.s16 q11, q7, q0 + ;step3[4] = step1b[3][i] - step1b[4][i]; + ;step3[5] = step1b[2][i] - step1b[5][i]; + ;step3[6] = step1b[1][i] - step1b[6][i]; + ;step3[7] = step1b[0][i] - step1b[7][i]; + vsub.s16 q12, q7, q0 + vsub.s16 q13, q6, q1 + vsub.s16 q14, q5, q3 + vsub.s16 q15, q4, q2 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[0] = step1b[0][i] + step1b[15][i]; + ;step1[1] = step1b[1][i] + step1b[14][i]; + ;step1[14] = step1b[1][i] - step1b[14][i]; + ;step1[15] = step1b[0][i] - step1b[15][i]; + LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 + vadd.s16 q2, q8, q1 + vadd.s16 q3, q9, q0 + vsub.s16 q4, q9, q0 + vsub.s16 q5, q8, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[14 * 32] = step1b[14][i] + step1b[17][i]; + ;output[15 * 32] = step1b[15][i] + step1b[16][i]; + ;output[16 * 32] = step1b[15][i] - step1b[16][i]; + ;output[17 * 32] = step1b[14][i] - step1b[17][i]; + LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 17, 17, 16, q7, q6 + STORE_IN_OUTPUT 16, 15, 14, q9, q8 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + 
step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 14, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 31, 31, 30, q7, q6 + STORE_IN_OUTPUT 30, 0, 1, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q6, q4, q1 + vadd.s16 q7, q5, q0 + vsub.s16 q8, q5, q0 + vsub.s16 q9, q4, q1 + STORE_IN_OUTPUT 19, 19, 18, q9, q8 + STORE_IN_OUTPUT 18, 13, 12, q7, q6 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 12, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 29, 29, 28, q7, q6 + STORE_IN_OUTPUT 28, 2, 3, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q6, q4, q1 + vadd.s16 q7, q5, q0 + vsub.s16 q8, q5, q0 + vsub.s16 q9, q4, q1 + STORE_IN_OUTPUT 21, 21, 20, q9, q8 + STORE_IN_OUTPUT 20, 11, 10, q7, q6 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 10, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 27, 27, 26, q7, q6 + STORE_IN_OUTPUT 26, 4, 5, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, 
q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q6, q4, q1 + vadd.s16 q7, q5, q0 + vsub.s16 q8, q5, q0 + vsub.s16 q9, q4, q1 + STORE_IN_OUTPUT 23, 23, 22, q9, q8 + STORE_IN_OUTPUT 22, 9, 8, q7, q6 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 8, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 25, 25, 24, q7, q6 + STORE_IN_OUTPUT 24, 6, 7, q4, q5 + ; -------------------------------------------------------------------------- + + ; TODO(cd) get rid of these push/pop by properly adjusting register + ; content at end of loop + pop {r2} + pop {r1} + pop {r0} + add r1, r1, #8*2 + add r2, r2, #8*32*2 + + ; bands loop processing + add r4, r4, #1 + cmp r4, #3 + BLE idct32_bands_loop + + pop {r4} + bx lr + ENDP ; |idct32_transpose_and_transform| + +;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); +; +; r0 uint8_t *dest +; r1 int16_t *out +; r2 int dest_stride) + +|idct32_combine_add| PROC + + mov r12, r0 ; dest pointer used for stores + sub r2, r2, #32 ; adjust the stride (remove the post-increments) + mov r3, #0 ; initialize loop counter + +idct32_combine_add_loop + ; load out[j * 32 + 0-31] + vld1.s16 {q12}, [r1]! + vld1.s16 {q13}, [r1]! + vld1.s16 {q14}, [r1]! + vld1.s16 {q15}, [r1]! + ; load dest[j * dest_stride + 0-31] + vld1.s16 {q6}, [r0]! + vld1.s16 {q7}, [r0]! + ; ROUND_POWER_OF_TWO + vrshr.s16 q12, q12, #6 + vrshr.s16 q13, q13, #6 + vrshr.s16 q14, q14, #6 + vrshr.s16 q15, q15, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q12, q12, d12 + vaddw.u8 q13, q13, d13 + vaddw.u8 q14, q14, d14 + vaddw.u8 q15, q15, d15 + ; clip pixel + vqmovun.s16 d12, q12 + vqmovun.s16 d13, q13 + vqmovun.s16 d14, q14 + vqmovun.s16 d15, q15 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {q6}, [r12]! + vst1.16 {q7}, [r12]! + ; increment pointers by adjusted stride (not necessary for r1/out) + add r0, r0, r2 + add r12, r12, r2 + ; loop processing + add r3, r3, #1 + cmp r3, #31 + BLE idct32_combine_add_loop + + bx lr + ENDP ; |idct32_combine_add| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm new file mode 100644 index 0000000..869ee5f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree.
+; + + + EXPORT |vp9_short_idct4x4_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct4x4_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.s16 q0, r0 ; duplicate a1 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q8, q0, d2 ; dest[x] + a1 + vaddw.u8 q9, q0, d4 + + vqmovun.s16 d6, q8 ; clip_pixel + vqmovun.s16 d7, q9 + + vst1.32 {d6[0]}, [r12], r2 + vst1.32 {d6[1]}, [r12], r2 + vst1.32 {d7[0]}, [r12], r2 + vst1.32 {d7[1]}, [r12] + + bx lr + ENDP ; |vp9_short_idct4x4_1_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm new file mode 100644 index 0000000..640fb93 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm @@ -0,0 +1,190 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_idct4x4_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct4x4_add_neon| PROC + + ; The 2D transform is done with two passes which are actually pretty + ; similar. We first transform the rows. This is done by transposing + ; the inputs, doing an SIMD column transform (the columns are the + ; transposed rows) and then transpose the results (so that it goes back + ; in normal/row positions). Then, we transform the columns by doing + ; another SIMD column transform. + ; So, two passes of a transpose followed by a column transform. + + ; load the inputs into q8-q9, d16-d19 + vld1.s16 {q8,q9}, [r0]! 
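Both passes below feed the usual add-and-clip epilogue (vrshr by 4 bits, vaddw.u8 into the destination, then vqmovun.s16 to saturate back to bytes). In scalar terms, a sketch with our own macro and helper names:

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static uint8_t clip_pixel(int val) {
      return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }

    /* vrshr #4 + vaddw.u8 + vqmovun.s16, per pixel of the 4x4 block */
    static void add_and_clip_4x4(uint8_t *dest, int stride, const int16_t *out) {
      for (int j = 0; j < 4; ++j)
        for (int i = 0; i < 4; ++i)
          dest[j * stride + i] = clip_pixel(dest[j * stride + i] +
                                            ROUND_POWER_OF_TWO(out[j * 4 + i], 4));
    }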
+ + ; generate scalar constants + ; cospi_8_64 = 15137 = 0x3b21 + mov r0, #0x3b00 + add r0, #0x21 + ; cospi_16_64 = 11585 = 0x2d41 + mov r3, #0x2d00 + add r3, #0x41 + ; cospi_24_64 = 6270 = 0x 187e + mov r12, #0x1800 + add r12, #0x7e + + ; transpose the input data + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + + ; generate constant vectors + vdup.16 d20, r0 ; replicate cospi_8_64 + vdup.16 d21, r3 ; replicate cospi_16_64 + + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + vdup.16 d22, r12 ; replicate cospi_24_64 + + ; do the transform on transposed rows + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + + ; transpose the results + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + ; do the transform on columns + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + + ; The results are in two registers, one of them being swapped. This will + ; be taken care of by loading the 'dest' value in a swapped fashion and + ; also storing them in the same swapped fashion. 
+ ; temp_out[0, 1] = d16, d17 = q8 + ; temp_out[2, 3] = d19, d18 = q9 swapped + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1] ; no post-increment + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vp9_short_idct4x4_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm new file mode 100644 index 0000000..923804f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_short_idct8x8_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct8x8_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 5) + add r0, r0, #16 ; + (1 <<((5) - 1)) + asr r0, r0, #5 ; >> 5 + + vdup.s16 q0, r0 ; duplicate a1 + + ; load destination data + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r2 + vld1.64 {d17}, [r1] + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_short_idct8x8_1_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index 
f829665..a744f59 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -9,6 +9,7 @@ ; EXPORT |vp9_short_idct8x8_add_neon| + EXPORT |vp9_short_idct10_8x8_add_neon| ARM REQUIRE8 PRESERVE8 @@ -24,191 +25,149 @@ ; stage 1 vdup.16 d0, r3 ; duplicate cospi_28_64 vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 ; input[1] * cospi_28_64 vmull.s16 q2, d18, d0 vmull.s16 q3, d19, d0 - ; input[7] * cospi_4_64 - vmull.s16 q4, d30, d1 - vmull.s16 q5, d31, d1 + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vsub.s32 q6, q2, q4 - vsub.s32 q7, q3, q5 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q6, #14 ; >> 14 - vqrshrn.s32 d9, q7, #14 ; >> 14 + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 ; input[1] * cospi_4_64 vmull.s16 q2, d18, d1 vmull.s16 q3, d19, d1 - ; input[7] * cospi_28_64 - vmull.s16 q1, d30, d0 - vmull.s16 q5, d31, d0 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vadd.s32 q2, q2, q1 - vadd.s32 q3, q3, q5 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - vdup.16 d0, r5 ; duplicate cospi_12_64 - vdup.16 d1, r6 ; duplicate cospi_20_64 - - ; input[5] * cospi_12_64 - vmull.s16 q2, d26, d0 - vmull.s16 q3, d27, d0 - - ; input[3] * cospi_20_64 - vmull.s16 q5, d22, d1 - vmull.s16 q6, d23, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vsub.s32 q2, q2, q5 - vsub.s32 q3, q3, q6 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - ; input[5] * cospi_20_64 - vmull.s16 q2, d26, d1 - vmull.s16 q3, d27, d1 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 - ; input[3] * cospi_12_64 - vmull.s16 q9, d22, d0 - vmull.s16 q15, d23, d0 + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vadd.s32 q0, q2, q9 - vadd.s32 q1, q3, q15 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q0, #14 ; >> 14 - vqrshrn.s32 d13, q1, #14 ; >> 14 + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 ; stage 2 & stage 3 - even half vdup.16 d0, r7 ; duplicate cospi_16_64 + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + ; input[0] * cospi_16_64 vmull.s16 q2, d16, d0 vmull.s16 q3, d17, d0 - ; input[2] * cospi_16_64 - vmull.s16 q9, d24, d0 - vmull.s16 q11, d25, d0 + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 ; (input[0] + input[2]) * cospi_16_64 - vadd.s32 q9, q2, q9 - vadd.s32 q11, q3, q11 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q9, #14 ; >> 14 - vqrshrn.s32 d19, q11, #14 ; >> 14 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 - ; input[2] * cospi_16_64 - vmull.s16 q13, d24, d0 - 
vmull.s16 q15, d25, d0 + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 - ; (input[0] - input[2]) * cospi_16_64 - vsub.s32 q2, q2, q13 - vsub.s32 q3, q3, q15 + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q2, #14 ; >> 14 - vqrshrn.s32 d23, q3, #14 ; >> 14 + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - ; input[1] * cospi_24_64 vmull.s16 q2, d20, d0 vmull.s16 q3, d21, d0 - ; input[3] * cospi_8_64 - vmull.s16 q13, d28, d1 - vmull.s16 q15, d29, d1 + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vsub.s32 q2, q2, q13 - vsub.s32 q3, q3, q15 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 ; dct_const_round_shift(input_dc * cospi_16_64) vqrshrn.s32 d26, q2, #14 ; >> 14 vqrshrn.s32 d27, q3, #14 ; >> 14 - ; input[1] * cospi_8_64 - vmull.s16 q2, d20, d1 - vmull.s16 q3, d21, d1 - - ; input[3] * cospi_24_64 - vmull.s16 q8, d28, d0 - vmull.s16 q10, d29, d0 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vadd.s32 q0, q2, q8 - vadd.s32 q1, q3, q10 - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q0, #14 ; >> 14 - vqrshrn.s32 d31, q1, #14 ; >> 14 - + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + ; stage 2 - odd half vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - ; step2[6] * cospi_16_64 vmull.s16 q9, d28, d16 vmull.s16 q10, d29, d16 - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 ; (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q9, q9, q11 - vsub.s32 q10, q10, q12 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 ; dct_const_round_shift(input_dc * cospi_16_64) vqrshrn.s32 d10, q9, #14 ; >> 14 vqrshrn.s32 d11, q10, #14 ; >> 14 - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 ; stage 4 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; @@ -247,14 +206,11 @@ |vp9_short_idct8x8_add_neon| PROC push {r4-r9} - vld1.s16 {q8}, [r0]! - vld1.s16 {q9}, [r0]! - vld1.s16 {q10}, [r0]! - vld1.s16 {q11}, [r0]! - vld1.s16 {q12}, [r0]! - vld1.s16 {q13}, [r0]! - vld1.s16 {q14}, [r0]! 
- vld1.s16 {q15}, [r0]! + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! ; transpose the input data TRANSPOSE8X8 @@ -349,8 +305,215 @@ vst1.64 {d6}, [r0], r2 vst1.64 {d7}, [r0], r2 + vpop {d8-d15} pop {r4-r9} bx lr ENDP ; |vp9_short_idct8x8_add_neon| +;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct10_8x8_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + ; stage 1 + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling + ; multiply and shift the result by 16 bits instead of 14 bits. So we need + ; to double the constants before multiplying to compensate this. + mov r12, r3, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_28_64*2 + mov r12, r4, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; dct_const_round_shift(input[1] * cospi_28_64) + vqrdmulh.s16 q4, q9, q0 + + mov r12, r6, lsl #1 + rsb r12, #0 + vdup.16 q0, r12 ; duplicate -cospi_20_64*2 + + ; dct_const_round_shift(input[1] * cospi_4_64) + vqrdmulh.s16 q7, q9, q1 + + mov r12, r5, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_12_64*2 + + ; dct_const_round_shift(- input[3] * cospi_20_64) + vqrdmulh.s16 q5, q11, q0 + + mov r12, r7, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_16_64*2 + + ; dct_const_round_shift(input[3] * cospi_12_64) + vqrdmulh.s16 q6, q11, q1 + + ; stage 2 & stage 3 - even half + mov r12, r8, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_24_64*2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrdmulh.s16 q9, q8, q0 + + mov r12, r9, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_8_64*2 + + ; dct_const_round_shift(input[1] * cospi_24_64) + vqrdmulh.s16 q13, q10, q1 + + ; dct_const_round_shift(input[1] * cospi_8_64) + vqrdmulh.s16 q15, q10, q0 + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 
d11, q10, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+ ENDP ; |vp9_short_idct10_8x8_add_neon|
+
+ END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm new file mode 100644 index 0000000..963ef35 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm @@ -0,0 +1,237 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_short_iht4x4_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix which are
+ ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
+ ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
+ ; into d16-d19 registers. This macro will touch the q10 - q15 registers and
+ ; use them as a buffer during calculation.
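+ ;
+ ; For cross-checking against the scalar path, the per-column math is
+ ; roughly the following C sketch (a sketch, not the library's exact
+ ; source; cospi_N_64 denotes round(cos(N * pi / 64) * (1 << 14)), and
+ ; each vqrshrn.s32 #14 below is one dct_const_round_shift):
+ ;
+ ;   int16_t dct_const_round_shift(int32_t x) {
+ ;     return (int16_t)((x + (1 << 13)) >> 14);  /* round half up, >> 14 */
+ ;   }
+ ;   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
+ ;   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
+ ;   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
+ ;   step[3] = dct_const_round_shift(in[1] * cospi_8_64  + in[3] * cospi_24_64);
+ ;   out[0] = step[0] + step[3];  out[1] = step[1] + step[2];
+ ;   out[2] = step[1] - step[2];  out[3] = step[0] - step[3];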
+ MACRO
+ IDCT4x4_1D
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
+ vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
+ vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
+ vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
+ vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q10, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
+ MEND
+
+ ; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix, which
+ ; is loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain
+ ; sinpi_2_9. d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The
+ ; output will be stored back into d16-d19 registers. This macro will touch
+ ; the q8 - q15 registers (q8/q9 carry the input and output) and use them
+ ; as a buffer during calculation.
+ MACRO
+ IADST4x4_1D
+ vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
+ vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
+ vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
+ vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
+ vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
+ vmovl.s16 q15, d16 ; expand x0 from 16 bits to 32 bits
+ vaddw.s16 q15, q15, d19 ; x0 + x3
+ vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
+ vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
+ vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
+
+ vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
+ vadd.s32 q10, q10, q8
+ vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
+ vdup.32 q8, r0 ; duplicate sinpi_3_9
+ vsub.s32 q11, q11, q9
+ vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
+
+ vadd.s32 q13, q10, q12 ; s0 = x0 + x3
+ vadd.s32 q10, q10, q11 ; x0 + x1
+ vadd.s32 q14, q11, q12 ; s1 = x1 + x3
+ vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d16, q13, #14
+ vqrshrn.s32 d17, q14, #14
+ vqrshrn.s32 d18, q15, #14
+ vqrshrn.s32 d19, q10, #14
+ MEND
+
+ ; Generate cosine constants in d0 - d2 for the IDCT.
+ MACRO
+ GENERATE_COSINE_CONSTANTS
+ ; cospi_8_64 = 15137 = 0x3b21
+ mov r0, #0x3b00
+ add r0, #0x21
+ ; cospi_16_64 = 11585 = 0x2d41
+ mov r3, #0x2d00
+ add r3, #0x41
+ ; cospi_24_64 = 6270 = 0x187e
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; generate constant vectors
+ vdup.16 d0, r0 ; duplicate cospi_8_64
+ vdup.16 d1, r3 ; duplicate cospi_16_64
+ vdup.16 d2, r12 ; duplicate cospi_24_64
+ MEND
+
+ ; Generate sine constants in d3 - d6 for the IADST (sinpi_3_9 is also
+ ; left in r0 for later scalar use).
+ MACRO
+ GENERATE_SINE_CONSTANTS
+ ; sinpi_1_9 = 5283 = 0x14A3
+ mov r0, #0x1400
+ add r0, #0xa3
+ ; sinpi_2_9 = 9929 = 0x26C9
+ mov r3, #0x2600
+ add r3, #0xc9
+ ; sinpi_4_9 = 15212 = 0x3B6C
+ mov r12, #0x3b00
+ add r12, #0x6c
+
+ ; generate constant vectors
+ vdup.16 d3, r0 ; duplicate sinpi_1_9
+
+ ; sinpi_3_9 = 13377 = 0x3441
+ mov r0, #0x3400
+ add r0, #0x41
+
+ vdup.16 d4, r3 ; duplicate sinpi_2_9
+ vdup.16 d5, r12 ; duplicate sinpi_4_9
+ vdup.16 q3, r0 ; duplicate sinpi_3_9
+ MEND
+
+ ; Transpose a 4x4 16-bit data matrix. The data is loaded in d16-d19.
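+ ; The vtrn.16 pair exchanges the odd 16-bit lanes between each register
+ ; pair and vtrn.32 then exchanges the odd 32-bit lane pairs; the two
+ ; passes compose into the full transpose. A scalar C sketch of the
+ ; intended effect (a hypothetical helper, for illustration only):
+ ;
+ ;   void transpose_4x4(int16_t m[4][4]) {
+ ;     int r, c;
+ ;     for (r = 0; r < 4; ++r)
+ ;       for (c = r + 1; c < 4; ++c) {
+ ;         const int16_t t = m[r][c];  /* swap across the diagonal */
+ ;         m[r][c] = m[c][r];
+ ;         m[c][r] = t;
+ ;       }
+ ;   }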
+ MACRO + TRANSPOSE4X4 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + vtrn.32 q8, q9 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride, int tx_type) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride +; r3 int tx_type) +; This function will only handle tx_type of 1,2,3. +|vp9_short_iht4x4_add_neon| PROC + + ; load the inputs into d16-d19 + vld1.s16 {q8,q9}, [r0]! + + ; transpose the input data + TRANSPOSE4X4 + + ; decide the type of transform + cmp r3, #2 + beq idct_iadst + cmp r3, #3 + beq iadst_iadst + +iadst_idct + ; generate constants + GENERATE_COSINE_CONSTANTS + GENERATE_SINE_CONSTANTS + + ; first transform rows + IDCT4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IADST4x4_1D + + b end_vp9_short_iht4x4_add_neon + +idct_iadst + ; generate constants + GENERATE_COSINE_CONSTANTS + GENERATE_SINE_CONSTANTS + + ; first transform rows + IADST4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IDCT4x4_1D + + b end_vp9_short_iht4x4_add_neon + +iadst_iadst + ; generate constants + GENERATE_SINE_CONSTANTS + + ; first transform rows + IADST4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IADST4x4_1D + +end_vp9_short_iht4x4_add_neon + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1], r2 + vld1.32 {d27[1]}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vp9_short_iht4x4_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm new file mode 100644 index 0000000..bab9cb4 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm @@ -0,0 +1,696 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_iht8x8_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Generate IADST constants in r0 - r12 for the IADST. 
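+ ; The constants are materialized with mov/add pairs because a classic ARM
+ ; data-processing immediate is an 8-bit value with an even rotation, so a
+ ; full 16-bit constant such as 16305 = 0x3fb1 cannot be encoded in one
+ ; mov (movw would also work on ARMv7; keeping the pair is presumably for
+ ; wider assembler compatibility, which is an assumption on this editor's
+ ; part). Sketch of the pattern in C terms:
+ ;
+ ;   r0 = 0x3f00;   /* mov: high byte, encodable as a rotated 8-bit value */
+ ;   r0 += 0xb1;    /* add: low byte, giving cospi_2_64 = 16305 */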
+ MACRO
+ GENERATE_IADST_CONSTANTS
+ ; generate cospi_2_64 = 16305
+ mov r0, #0x3f00
+ add r0, #0xb1
+
+ ; generate cospi_30_64 = 1606
+ mov r1, #0x600
+ add r1, #0x46
+
+ ; generate cospi_10_64 = 14449
+ mov r2, #0x3800
+ add r2, #0x71
+
+ ; generate cospi_22_64 = 7723
+ mov r3, #0x1e00
+ add r3, #0x2b
+
+ ; generate cospi_18_64 = 10394
+ mov r4, #0x2800
+ add r4, #0x9a
+
+ ; generate cospi_14_64 = 12665
+ mov r5, #0x3100
+ add r5, #0x79
+
+ ; generate cospi_26_64 = 4756
+ mov r6, #0x1200
+ add r6, #0x94
+
+ ; generate cospi_6_64 = 15679
+ mov r7, #0x3d00
+ add r7, #0x3f
+
+ ; generate cospi_8_64 = 15137
+ mov r8, #0x3b00
+ add r8, #0x21
+
+ ; generate cospi_24_64 = 6270
+ mov r9, #0x1800
+ add r9, #0x7e
+
+ ; generate 0
+ mov r10, #0
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+ MEND
+
+ ; Generate IDCT constants in r3 - r9 for the IDCT.
+ MACRO
+ GENERATE_IDCT_CONSTANTS
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+ MEND
+
+ ; Transpose an 8x8 16-bit data matrix. The data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which
+ ; are loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The
+ ; output will be stored back into q8-q15 registers. This macro will touch
+ ; the q0-q7 registers and use them as a buffer during calculation.
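+ ;
+ ; For reference, stage 1 of the odd half below corresponds to this C
+ ; sketch (same dct_const_round_shift((x + (1 << 13)) >> 14) rounding as
+ ; in the 4x4 version; a sketch, not the library's exact source):
+ ;
+ ;   step1[4] = dct_const_round_shift(in[1] * cospi_28_64 - in[7] * cospi_4_64);
+ ;   step1[7] = dct_const_round_shift(in[1] * cospi_4_64  + in[7] * cospi_28_64);
+ ;   step1[5] = dct_const_round_shift(in[5] * cospi_12_64 - in[3] * cospi_20_64);
+ ;   step1[6] = dct_const_round_shift(in[5] * cospi_20_64 + in[3] * cospi_12_64);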
+ MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7 ; duplicate cospi_16_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 + + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 + + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 + + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * 
cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + MEND + + ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which + ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The + ; output will be stored back into q8-q15 registers. This macro will touch + ; q0 - q7 registers and use them as buffer during calculation. + MACRO + IADST8X8_1D + vdup.16 d14, r0 ; duplicate cospi_2_64 + vdup.16 d15, r1 ; duplicate cospi_30_64 + + ; cospi_2_64 * x0 + vmull.s16 q1, d30, d14 + vmull.s16 q2, d31, d14 + + ; cospi_30_64 * x0 + vmull.s16 q3, d30, d15 + vmull.s16 q4, d31, d15 + + vdup.16 d30, r4 ; duplicate cospi_18_64 + vdup.16 d31, r5 ; duplicate cospi_14_64 + + ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + vmlal.s16 q1, d16, d15 + vmlal.s16 q2, d17, d15 + + ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1 + vmlsl.s16 q3, d16, d14 + vmlsl.s16 q4, d17, d14 + + ; cospi_18_64 * x4 + vmull.s16 q5, d22, d30 + vmull.s16 q6, d23, d30 + + ; cospi_14_64 * x4 + vmull.s16 q7, d22, d31 + vmull.s16 q8, d23, d31 + + ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + vmlal.s16 q5, d24, d31 + vmlal.s16 q6, d25, d31 + + ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5 + vmlsl.s16 q7, d24, d30 + vmlsl.s16 q8, d25, d30 + + ; (s0 + s4) + vadd.s32 q11, q1, q5 + vadd.s32 q12, q2, q6 + + vdup.16 d0, r2 ; duplicate cospi_10_64 + vdup.16 d1, r3 ; duplicate cospi_22_64 + + ; (s0 - s4) + vsub.s32 q1, q1, q5 + vsub.s32 q2, q2, q6 + + ; x0 = dct_const_round_shift(s0 + s4); + vqrshrn.s32 d22, q11, #14 ; >> 14 + vqrshrn.s32 d23, q12, #14 ; >> 14 + + ; (s1 + s5) + vadd.s32 q12, q3, q7 + vadd.s32 q15, q4, q8 + + ; (s1 - s5) + vsub.s32 q3, q3, q7 + vsub.s32 q4, q4, q8 + + ; x4 = dct_const_round_shift(s0 - s4); + vqrshrn.s32 d2, q1, #14 ; >> 14 + vqrshrn.s32 d3, q2, #14 ; >> 14 + + ; x1 = dct_const_round_shift(s1 + s5); + vqrshrn.s32 d24, q12, #14 ; >> 14 + vqrshrn.s32 d25, q15, #14 ; >> 14 + + ; x5 = dct_const_round_shift(s1 - s5); + vqrshrn.s32 d6, q3, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; cospi_10_64 * x2 + vmull.s16 q4, d26, d0 + vmull.s16 q5, d27, d0 + + ; cospi_22_64 * x2 + vmull.s16 q2, d26, d1 + vmull.s16 q6, d27, d1 + + vdup.16 d30, r6 ; duplicate cospi_26_64 + vdup.16 d31, r7 ; duplicate cospi_6_64 + + ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + vmlal.s16 q4, d20, d1 + vmlal.s16 q5, d21, d1 + + ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + vmlsl.s16 q2, d20, d0 + vmlsl.s16 q6, d21, d0 + + ; cospi_26_64 * x6 + vmull.s16 q0, d18, d30 + vmull.s16 q13, d19, d30 + + ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + vmlal.s16 q0, d28, d31 + vmlal.s16 q13, d29, d31 + + ; cospi_6_64 * x6 + vmull.s16 q10, d18, d31 + vmull.s16 q9, d19, d31 + + ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + vmlsl.s16 q10, d28, d30 + vmlsl.s16 q9, d29, d30 + + ; (s3 + s7) + vadd.s32 q14, q2, q10 + vadd.s32 
q15, q6, q9
+
+ ; (s3 - s7)
+ vsub.s32 q2, q2, q10
+ vsub.s32 q6, q6, q9
+
+ ; x3 = dct_const_round_shift(s3 + s7);
+ vqrshrn.s32 d28, q14, #14 ; >> 14
+ vqrshrn.s32 d29, q15, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s3 - s7);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; (s2 + s6)
+ vadd.s32 q9, q4, q0
+ vadd.s32 q10, q5, q13
+
+ ; (s2 - s6)
+ vsub.s32 q4, q4, q0
+ vsub.s32 q5, q5, q13
+
+ vdup.16 d30, r8 ; duplicate cospi_8_64
+ vdup.16 d31, r9 ; duplicate cospi_24_64
+
+ ; x2 = dct_const_round_shift(s2 + s6);
+ vqrshrn.s32 d18, q9, #14 ; >> 14
+ vqrshrn.s32 d19, q10, #14 ; >> 14
+
+ ; x6 = dct_const_round_shift(s2 - s6);
+ vqrshrn.s32 d8, q4, #14 ; >> 14
+ vqrshrn.s32 d9, q5, #14 ; >> 14
+
+ ; cospi_8_64 * x4
+ vmull.s16 q5, d2, d30
+ vmull.s16 q6, d3, d30
+
+ ; cospi_24_64 * x4
+ vmull.s16 q7, d2, d31
+ vmull.s16 q0, d3, d31
+
+ ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ vmlal.s16 q5, d6, d31
+ vmlal.s16 q6, d7, d31
+
+ ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ vmlsl.s16 q7, d6, d30
+ vmlsl.s16 q0, d7, d30
+
+ ; cospi_8_64 * x7
+ vmull.s16 q1, d4, d30
+ vmull.s16 q3, d5, d30
+
+ ; cospi_24_64 * x7
+ vmull.s16 q10, d4, d31
+ vmull.s16 q2, d5, d31
+
+ ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ vmlsl.s16 q1, d8, d31
+ vmlsl.s16 q3, d9, d31
+
+ ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ vmlal.s16 q10, d8, d30
+ vmlal.s16 q2, d9, d30
+
+ vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
+
+ vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
+
+ vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
+
+ vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
+
+ ; (s4 + s6)
+ vadd.s32 q14, q5, q1
+ vadd.s32 q15, q6, q3
+
+ ; (s4 - s6)
+ vsub.s32 q5, q5, q1
+ vsub.s32 q6, q6, q3
+
+ ; x4 = dct_const_round_shift(s4 + s6);
+ vqrshrn.s32 d18, q14, #14 ; >> 14
+ vqrshrn.s32 d19, q15, #14 ; >> 14
+
+ ; x6 = dct_const_round_shift(s4 - s6);
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; (s5 + s7)
+ vadd.s32 q1, q7, q10
+ vadd.s32 q3, q0, q2
+
+ ; (s5 - s7)
+ vsub.s32 q7, q7, q10
+ vsub.s32 q0, q0, q2
+
+ ; x5 = dct_const_round_shift(s5 + s7);
+ vqrshrn.s32 d28, q1, #14 ; >> 14
+ vqrshrn.s32 d29, q3, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s5 - s7);
+ vqrshrn.s32 d14, q7, #14 ; >> 14
+ vqrshrn.s32 d15, q0, #14 ; >> 14
+
+ vdup.16 d30, r12 ; duplicate cospi_16_64
+
+ ; cospi_16_64 * x2
+ vmull.s16 q2, d22, d30
+ vmull.s16 q3, d23, d30
+
+ ; cospi_16_64 * x2
+ vmull.s16 q13, d22, d30
+ vmull.s16 q1, d23, d30
+
+ ; cospi_16_64 * x2 + cospi_16_64 * x3;
+ vmlal.s16 q2, d24, d30
+ vmlal.s16 q3, d25, d30
+
+ ; cospi_16_64 * x2 - cospi_16_64 * x3;
+ vmlsl.s16 q13, d24, d30
+ vmlsl.s16 q1, d25, d30
+
+ ; x2 = dct_const_round_shift(s2);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q3, #14 ; >> 14
+
+ ; x3 = dct_const_round_shift(s3);
+ vqrshrn.s32 d24, q13, #14 ; >> 14
+ vqrshrn.s32 d25, q1, #14 ; >> 14
+
+ ; cospi_16_64 * x6
+ vmull.s16 q13, d10, d30
+ vmull.s16 q1, d11, d30
+
+ ; cospi_16_64 * x6
+ vmull.s16 q11, d10, d30
+ vmull.s16 q0, d11, d30
+
+ ; cospi_16_64 * x6 + cospi_16_64 * x7;
+ vmlal.s16 q13, d14, d30
+ vmlal.s16 q1, d15, d30
+
+ ; cospi_16_64 * x6 - cospi_16_64 * x7;
+ vmlsl.s16 q11, d14, d30
+ vmlsl.s16 q0, d15, d30
+
+ ; x6 = dct_const_round_shift(s6);
+ vqrshrn.s32 d20, q13, #14 ; >> 14
+ vqrshrn.s32 d21, q1, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s7);
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q0, #14 ; >> 14
+
+ vdup.16 q5, r10 ; duplicate 0
+
+ vsub.s16 q9, q5, q9 ; output[1] = -x4;
+ vsub.s16 q11, q5, q2 ; output[3] = -x2;
+ vsub.s16 q13, q5, q6 ;
output[5] = -x7; + vsub.s16 q15, q5, q4 ; output[7] = -x1; + MEND + + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride, int tx_type) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride +; r3 int tx_type) +; This function will only handle tx_type of 1,2,3. +|vp9_short_iht8x8_add_neon| PROC + + ; load the inputs into d16-d19 + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + push {r0-r10} + + ; transpose the input data + TRANSPOSE8X8 + + ; decide the type of transform + cmp r3, #2 + beq idct_iadst + cmp r3, #3 + beq iadst_iadst + +iadst_idct + ; generate IDCT constants + GENERATE_IDCT_CONSTANTS + + ; first transform rows + IDCT8x8_1D + + ; transpose the matrix + TRANSPOSE8X8 + + ; generate IADST constants + GENERATE_IADST_CONSTANTS + + ; then transform columns + IADST8X8_1D + + b end_vp9_short_iht8x8_add_neon + +idct_iadst + ; generate IADST constants + GENERATE_IADST_CONSTANTS + + ; first transform rows + IADST8X8_1D + + ; transpose the matrix + TRANSPOSE8X8 + + ; generate IDCT constants + GENERATE_IDCT_CONSTANTS + + ; then transform columns + IDCT8x8_1D + + b end_vp9_short_iht8x8_add_neon + +iadst_iadst + ; generate IADST constants + GENERATE_IADST_CONSTANTS + + ; first transform rows + IADST8X8_1D + + ; transpose the matrix + TRANSPOSE8X8 + + ; then transform columns + IADST8X8_1D + +end_vp9_short_iht8x8_add_neon + pop {r0-r10} + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + bx lr + ENDP ; |vp9_short_iht8x8_add_neon| + + END diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c index 79092cd..f144721 100644 --- a/libvpx/vp9/common/generic/vp9_systemdependent.c +++ b/libvpx/vp9/common/generic/vp9_systemdependent.c @@ -13,6 +13,7 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_onyxc_int.h" -void vp9_machine_specific_config(VP9_COMMON *ctx) { +void vp9_machine_specific_config(VP9_COMMON *cm) { + (void)cm; vp9_rtcd(); } diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 554a317..864e27e 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -31,40 +31,30 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO)); } -void 
vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) { - int i, j; - - // For each in image mode_info element set the in image flag to 1 - for (i = 0; i < cm->mi_rows; i++) { - MODE_INFO *ptr = mi; - for (j = 0; j < cm->mi_cols; j++) { - ptr->mbmi.mb_in_image = 1; - ptr++; // Next element in the row - } - - // Step over border element at start of next row - mi += cm->mode_info_stride; - } -} - -void vp9_free_frame_buffers(VP9_COMMON *oci) { +void vp9_free_frame_buffers(VP9_COMMON *cm) { int i; for (i = 0; i < NUM_YV12_BUFFERS; i++) - vp9_free_frame_buffer(&oci->yv12_fb[i]); + vp9_free_frame_buffer(&cm->yv12_fb[i]); - vp9_free_frame_buffer(&oci->post_proc_buffer); + vp9_free_frame_buffer(&cm->post_proc_buffer); - vpx_free(oci->mip); - vpx_free(oci->prev_mip); - vpx_free(oci->above_seg_context); + vpx_free(cm->mip); + vpx_free(cm->prev_mip); + vpx_free(cm->above_seg_context); + vpx_free(cm->last_frame_seg_map); + vpx_free(cm->mi_grid_base); + vpx_free(cm->prev_mi_grid_base); - vpx_free(oci->above_context[0]); + vpx_free(cm->above_context[0]); for (i = 0; i < MAX_MB_PLANE; i++) - oci->above_context[i] = 0; - oci->mip = NULL; - oci->prev_mip = NULL; - oci->above_seg_context = NULL; + cm->above_context[i] = 0; + cm->mip = NULL; + cm->prev_mip = NULL; + cm->above_seg_context = NULL; + cm->last_frame_seg_map = NULL; + cm->mi_grid_base = NULL; + cm->prev_mi_grid_base = NULL; } static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { @@ -72,112 +62,125 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { cm->mb_rows = (aligned_height + 8) >> 4; cm->MBs = cm->mb_rows * cm->mb_cols; - cm->mi_cols = aligned_width >> LOG2_MI_SIZE; - cm->mi_rows = aligned_height >> LOG2_MI_SIZE; + cm->mi_cols = aligned_width >> MI_SIZE_LOG2; + cm->mi_rows = aligned_height >> MI_SIZE_LOG2; cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; } static void setup_mi(VP9_COMMON *cm) { cm->mi = cm->mip + cm->mode_info_stride + 1; cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; vpx_memset(cm->mip, 0, cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_in_image(cm, cm->mi); + vpx_memset(cm->mi_grid_base, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * + sizeof(*cm->mi_grid_base)); + vp9_update_mode_info_border(cm, cm->mip); vp9_update_mode_info_border(cm, cm->prev_mip); - vp9_update_mode_info_in_image(cm, cm->prev_mi); } -int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { +int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { int i, mi_cols; - const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE); - const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE); - const int ss_x = oci->subsampling_x; - const int ss_y = oci->subsampling_y; + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + const int ss_x = cm->subsampling_x; + const int ss_y = cm->subsampling_y; int mi_size; - vp9_free_frame_buffers(oci); + vp9_free_frame_buffers(cm); for (i = 0; i < NUM_YV12_BUFFERS; i++) { - oci->fb_idx_ref_cnt[i] = 0; - if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y, + cm->fb_idx_ref_cnt[i] = 0; + if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) 
goto fail; } - oci->new_fb_idx = NUM_YV12_BUFFERS - 1; - oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1; + cm->new_fb_idx = NUM_YV12_BUFFERS - 1; + cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++) - oci->active_ref_idx[i] = i; + cm->active_ref_idx[i] = i; for (i = 0; i < NUM_REF_FRAMES; i++) { - oci->ref_frame_map[i] = i; - oci->fb_idx_ref_cnt[i] = 1; + cm->ref_frame_map[i] = i; + cm->fb_idx_ref_cnt[i] = 1; } - if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y, + if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) goto fail; - set_mb_mi(oci, aligned_width, aligned_height); + set_mb_mi(cm, aligned_width, aligned_height); // Allocation - mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE); + mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); - oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!oci->mip) + cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (!cm->mip) goto fail; - oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!oci->prev_mip) + cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (!cm->prev_mip) goto fail; - setup_mi(oci); + cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (!cm->mi_grid_base) + goto fail; + + cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); + if (!cm->prev_mi_grid_base) + goto fail; + + setup_mi(cm); // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(oci->mi_cols); + mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm // block where mi unit size is 8x8. -# if CONFIG_ALPHA - oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1); -#else - oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1); -#endif - if (!oci->above_context[0]) + cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * + (2 * mi_cols), 1); + if (!cm->above_context[0]) goto fail; - oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); - if (!oci->above_seg_context) + cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); + if (!cm->above_seg_context) + goto fail; + + // Create the segmentation map structure and set to 0. 
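+ // One byte per 8x8 mode-info unit; a lookup is then roughly
+ // seg_map[mi_row * cm->mi_cols + mi_col] (a sketch of the indexing,
+ // not a helper defined in this file).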
+ cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + if (!cm->last_frame_seg_map) goto fail; return 0; fail: - vp9_free_frame_buffers(oci); + vp9_free_frame_buffers(cm); return 1; } -void vp9_create_common(VP9_COMMON *oci) { - vp9_machine_specific_config(oci); +void vp9_create_common(VP9_COMMON *cm) { + vp9_machine_specific_config(cm); - vp9_init_mbmode_probs(oci); + vp9_init_mbmode_probs(cm); - oci->tx_mode = ONLY_4X4; - oci->comp_pred_mode = HYBRID_PREDICTION; + cm->tx_mode = ONLY_4X4; + cm->comp_pred_mode = HYBRID_PREDICTION; // Initialize reference frame sign bias structure to defaults - vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); } -void vp9_remove_common(VP9_COMMON *oci) { - vp9_free_frame_buffers(oci); +void vp9_remove_common(VP9_COMMON *cm) { + vp9_free_frame_buffers(cm); } void vp9_initialize_common() { @@ -188,8 +191,8 @@ void vp9_initialize_common() { void vp9_update_frame_size(VP9_COMMON *cm) { int i, mi_cols; - const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE); - const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE); + const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); @@ -198,4 +201,8 @@ void vp9_update_frame_size(VP9_COMMON *cm) { for (i = 1; i < MAX_MB_PLANE; i++) cm->above_context[i] = cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; + + // Initialize the previous frame segment map to 0. + if (cm->last_frame_seg_map) + vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h index 8bf5ed1..5d5fae9 100644 --- a/libvpx/vp9/common/vp9_alloccommon.h +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -16,14 +16,13 @@ void vp9_initialize_common(); -void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi); -void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi); +void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi); -void vp9_create_common(VP9_COMMON *oci); -void vp9_remove_common(VP9_COMMON *oci); +void vp9_create_common(VP9_COMMON *cm); +void vp9_remove_common(VP9_COMMON *cm); -int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height); -void vp9_free_frame_buffers(VP9_COMMON *oci); +int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height); +void vp9_free_frame_buffers(VP9_COMMON *cm); void vp9_update_frame_size(VP9_COMMON *cm); diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index f68c5c6..c8d677f 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -19,9 +19,9 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common_data.h" -#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" @@ -56,11 +56,11 @@ typedef enum { } FRAME_TYPE; typedef enum { - EIGHTTAP_SMOOTH, - EIGHTTAP, - EIGHTTAP_SHARP, - BILINEAR, - SWITCHABLE /* should be the last one */ + EIGHTTAP = 0, + EIGHTTAP_SMOOTH = 1, + EIGHTTAP_SHARP = 2, + BILINEAR = 3, + SWITCHABLE = 4 /* should be the last one */ } INTERPOLATIONFILTERTYPE; typedef enum { @@ -71,7 +71,7 @@ typedef enum { D135_PRED, // Directional 135 deg = 180 - 45 
D117_PRED, // Directional 117 deg = 180 - 63 D153_PRED, // Directional 153 deg = 180 - 27 - D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi) + D207_PRED, // Directional 207 deg = 180 + 27 D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) TM_PRED, // True-motion NEARESTMV, @@ -89,9 +89,9 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } -#define VP9_INTRA_MODES (TM_PRED + 1) +#define INTRA_MODES (TM_PRED + 1) -#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV) +#define INTER_MODES (1 + NEWMV - NEARESTMV) static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { return (mode - NEARESTMV); @@ -115,45 +115,41 @@ typedef enum { MAX_REF_FRAMES = 4 } MV_REFERENCE_FRAME; -static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int b_width_log2(BLOCK_SIZE sb_type) { return b_width_log2_lookup[sb_type]; } -static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int b_height_log2(BLOCK_SIZE sb_type) { return b_height_log2_lookup[sb_type]; } -static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int mi_width_log2(BLOCK_SIZE sb_type) { return mi_width_log2_lookup[sb_type]; } -static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int mi_height_log2(BLOCK_SIZE sb_type) { return mi_height_log2_lookup[sb_type]; } +// This structure now relates to 8x8 block regions. typedef struct { MB_PREDICTION_MODE mode, uv_mode; MV_REFERENCE_FRAME ref_frame[2]; - TX_SIZE txfm_size; - int_mv mv[2]; // for each reference frame used + TX_SIZE tx_size; + int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int_mv best_mv, best_second_mv; - uint8_t mb_mode_context[MAX_REF_FRAMES]; + uint8_t mode_context[MAX_REF_FRAMES]; - unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ - unsigned char segment_id; // Segment id for current frame + unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients + unsigned char segment_id; // Segment id for this block. 
- // Flags used for prediction status of various bistream signals + // Flags used for prediction status of various bit-stream signals unsigned char seg_id_predicted; - // Indicates if the mb is part of the image (1) vs border (0) - // This can be useful in determining whether the MB provides - // a valid predictor - unsigned char mb_in_image; - INTERPOLATIONFILTERTYPE interp_filter; - BLOCK_SIZE_TYPE sb_type; + BLOCK_SIZE sb_type; } MB_MODE_INFO; typedef struct { @@ -161,36 +157,19 @@ typedef struct { union b_mode_info bmi[4]; } MODE_INFO; -static int is_inter_block(const MB_MODE_INFO *mbmi) { +static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME; } +static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[1] > INTRA_FRAME; +} enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; -#define VP9_REF_SCALE_SHIFT 14 -#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT) - -struct scale_factors { - int x_scale_fp; // horizontal fixed point scale factor - int y_scale_fp; // vertical fixed point scale factor - int x_offset_q4; - int x_step_q4; - int y_offset_q4; - int y_step_q4; - - int (*scale_value_x)(int val, const struct scale_factors *scale); - int (*scale_value_y)(int val, const struct scale_factors *scale); - void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); - MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale); - MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale); - - convolve_fn_t predict[2][2][2]; // horiz, vert, avg -}; - #if CONFIG_ALPHA enum { MAX_MB_PLANE = 4 }; #else @@ -216,45 +195,27 @@ struct macroblockd_plane { ENTROPY_CONTEXT *left_context; }; -#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n)) - -#define MAX_REF_LF_DELTAS 4 -#define MAX_MODE_LF_DELTAS 2 - -struct loopfilter { - int filter_level; - - int sharpness_level; - int last_sharpness_level; - - uint8_t mode_ref_delta_enabled; - uint8_t mode_ref_delta_update; - - // 0 = Intra, Last, GF, ARF - signed char ref_deltas[MAX_REF_LF_DELTAS]; - signed char last_ref_deltas[MAX_REF_LF_DELTAS]; - - // 0 = ZERO_MV, MV - signed char mode_deltas[MAX_MODE_LF_DELTAS]; - signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; -}; +#define BLOCK_OFFSET(x, i) ((x) + (i) * 16) typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; struct scale_factors scale_factor[2]; - MODE_INFO *prev_mode_info_context; - MODE_INFO *mode_info_context; + MODE_INFO *last_mi; + MODE_INFO *this_mi; int mode_info_stride; + MODE_INFO *mic_stream_ptr; + + // A NULL indicates that the 8x8 is not part of the image + MODE_INFO **mi_8x8; + MODE_INFO **prev_mi_8x8; + int up_available; int left_available; int right_available; - struct segmentation seg; - struct loopfilter lf; - // partition contexts PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT *left_seg_context; @@ -286,7 +247,7 @@ typedef struct macroblockd { } MACROBLOCKD; -static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { +static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: case BLOCK_64X32: @@ -311,9 +272,8 @@ static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsi } } -static INLINE void update_partition_context(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE sb_type, - BLOCK_SIZE_TYPE sb_size) { +static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type, + BLOCK_SIZE sb_size) { const int bsl = b_width_log2(sb_size), 
bs = (1 << bsl) / 2; const int bwl = b_width_log2(sb_type); const int bhl = b_height_log2(sb_type); @@ -331,8 +291,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs); } -static INLINE int partition_plane_context(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE sb_type) { +static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) { int bsl = mi_width_log2(sb_type), bs = 1 << bsl; int above = 0, left = 0, i; int boffset = mi_width_log2(BLOCK_64X64) - bsl; @@ -352,10 +311,9 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } -static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize, - PARTITION_TYPE partition) { - BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize]; - assert(subsize != BLOCK_SIZE_TYPES); +static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { + const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; + assert(subsize < BLOCK_SIZES); return subsize; } @@ -363,7 +321,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int ib) { - const MODE_INFO *const mi = xd->mode_info_context; + const MODE_INFO *const mi = xd->this_mi; const MB_MODE_INFO *const mbmi = &mi->mbmi; if (plane_type != PLANE_TYPE_Y_WITH_DC || @@ -378,13 +336,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; + mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT; } static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; + mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT; } static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { @@ -404,259 +362,147 @@ static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]); + return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]); } -struct plane_block_idx { - int plane; - int block; -}; - -// TODO(jkoleszar): returning a struct so it can be used in a const context, -// expect to refactor this further later. 
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks, - int b_idx) { - const int v_offset = y_blocks * 5 / 4; - struct plane_block_idx res; - - if (b_idx < y_blocks) { - res.plane = 0; - res.block = b_idx; - } else if (b_idx < v_offset) { - res.plane = 1; - res.block = b_idx - y_blocks; - } else { - assert(b_idx < y_blocks * 3 / 2); - res.plane = 2; - res.block = b_idx - v_offset; - } - return res; +static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd) { + BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; + assert(bs < BLOCK_SIZES); + return bs; } -static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize, +static INLINE int plane_block_width(BLOCK_SIZE bsize, const struct macroblockd_plane* plane) { return 4 << (b_width_log2(bsize) - plane->subsampling_x); } -static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize, +static INLINE int plane_block_height(BLOCK_SIZE bsize, const struct macroblockd_plane* plane) { return 4 << (b_height_log2(bsize) - plane->subsampling_y); } -static INLINE int plane_block_width_log2by4( - BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { - return (b_width_log2(bsize) - plane->subsampling_x); -} - -static INLINE int plane_block_height_log2by4( - BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { - return (b_height_log2(bsize) - plane->subsampling_y); -} - typedef void (*foreach_transformed_block_visitor)(int plane, int block, - BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); static INLINE void foreach_transformed_block_in_plane( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane, + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, foreach_transformed_block_visitor visit, void *arg) { - const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); - + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi; // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. - const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi; const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) - : mbmi->txfm_size; - const int block_size_b = bw + bh; - const int txfrm_size_b = tx_size * 2; - - // subsampled size of the block - const int ss_sum = xd->plane[plane].subsampling_x - + xd->plane[plane].subsampling_y; - const int ss_block_size = block_size_b - ss_sum; - - const int step = 1 << txfrm_size_b; - + : mbmi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); int i; - assert(txfrm_size_b <= block_size_b); - assert(txfrm_size_b <= ss_block_size); - // If mb_to_right_edge is < 0 we are in a situation in which // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { int r, c; - const int sw = bw - xd->plane[plane].subsampling_x; - const int sh = bh - xd->plane[plane].subsampling_y; - int max_blocks_wide = 1 << sw; - int max_blocks_high = 1 << sh; + + int max_blocks_wide = num_4x4_w; + int max_blocks_high = num_4x4_h; // xd->mb_to_right_edge is in units of pixels * 8. This converts // it to 4x4 block sizes. 
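+ // Worked example (assuming no subsampling): a block that overhangs the
+ // right edge by 4 pixels has mb_to_right_edge == -4 * 8 == -32, and
+ // -32 >> 5 == -1, trimming exactly one 4x4 column.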
if (xd->mb_to_right_edge < 0) - max_blocks_wide += - (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); + max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); if (xd->mb_to_bottom_edge < 0) - max_blocks_high += - (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); + max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); i = 0; // Unlike the normal case - in here we have to keep track of the // row and column of the blocks we use so that we know if we are in // the unrestricted motion border. - for (r = 0; r < (1 << sh); r += (1 << tx_size)) { - for (c = 0; c < (1 << sw); c += (1 << tx_size)) { + for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { + for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { if (r < max_blocks_high && c < max_blocks_wide) - visit(plane, i, bsize, txfrm_size_b, arg); + visit(plane, i, plane_bsize, tx_size, arg); i += step; } } } else { - for (i = 0; i < (1 << ss_block_size); i += step) { - visit(plane, i, bsize, txfrm_size_b, arg); - } + for (i = 0; i < num_4x4_w * num_4x4_h; i += step) + visit(plane, i, plane_bsize, tx_size, arg); } } static INLINE void foreach_transformed_block( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, foreach_transformed_block_visitor visit, void *arg) { int plane; - for (plane = 0; plane < MAX_MB_PLANE; plane++) { - foreach_transformed_block_in_plane(xd, bsize, plane, - visit, arg); - } + for (plane = 0; plane < MAX_MB_PLANE; plane++) + foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } static INLINE void foreach_transformed_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, foreach_transformed_block_visitor visit, void *arg) { int plane; - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - foreach_transformed_block_in_plane(xd, bsize, plane, - visit, arg); - } + for (plane = 1; plane < MAX_MB_PLANE; plane++) + foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for -// sizes smaller than 16x16 yet. -typedef void (*foreach_predicted_block_visitor)(int plane, int block, - BLOCK_SIZE_TYPE bsize, - int pred_w, int pred_h, - void *arg); -static INLINE void foreach_predicted_block_in_plane( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane, - foreach_predicted_block_visitor visit, void *arg) { - int i, x, y; - - // block sizes in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // subsampled size of the block - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - - // size of the predictor to use. 
- int pred_w, pred_h; - - if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) { - assert(bsize == BLOCK_8X8); - pred_w = 0; - pred_h = 0; - } else { - pred_w = bwl; - pred_h = bhl; - } - assert(pred_w <= bwl); - assert(pred_h <= bhl); - - // visit each subblock in raster order - i = 0; - for (y = 0; y < 1 << bhl; y += 1 << pred_h) { - for (x = 0; x < 1 << bwl; x += 1 << pred_w) { - visit(plane, i, bsize, pred_w, pred_h, arg); - i += 1 << pred_w; - } - i += (1 << (bwl + pred_h)) - (1 << bwl); - } -} -static INLINE void foreach_predicted_block( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, - foreach_predicted_block_visitor visit, void *arg) { - int plane; - - for (plane = 0; plane < MAX_MB_PLANE; plane++) { - foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); - } +static int raster_block_offset(BLOCK_SIZE plane_bsize, + int raster_block, int stride) { + const int bw = b_width_log2(plane_bsize); + const int y = 4 * (raster_block >> bw); + const int x = 4 * (raster_block & ((1 << bw) - 1)); + return y * stride + x; } -static INLINE void foreach_predicted_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, - foreach_predicted_block_visitor visit, void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); - } +static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, + int raster_block, int16_t *base) { + const int stride = 4 << b_width_log2(plane_bsize); + return base + raster_block_offset(plane_bsize, raster_block, stride); } -static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - int plane, int block, int stride) { - const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1)); - return y * stride + x; +static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize, + int raster_block, uint8_t *base, + int stride) { + return base + raster_block_offset(plane_bsize, raster_block, stride); } -static int16_t* raster_block_offset_int16(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, int16_t *base) { - const int stride = plane_block_width(bsize, &xd->plane[plane]); - return base + raster_block_offset(xd, bsize, plane, block, stride); -} -static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, - uint8_t *base, int stride) { - return base + raster_block_offset(xd, bsize, plane, block, stride); -} - -static int txfrm_block_to_raster_block(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, - int ss_txfrm_size) { - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int txwl = ss_txfrm_size / 2; - const int tx_cols_log2 = bwl - txwl; + +static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int block) { + const int bwl = b_width_log2(plane_bsize); + const int tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> ss_txfrm_size; - const int x = (raster_mb & (tx_cols - 1)) << (txwl); - const int y = raster_mb >> tx_cols_log2 << (txwl); + const int raster_mb = block >> (tx_size << 1); + const int x = (raster_mb & (tx_cols - 1)) << tx_size; + const int y = (raster_mb >> tx_cols_log2) << tx_size; return x + (y << bwl); } -static void txfrm_block_to_raster_xy(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, - int ss_txfrm_size, +static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, + 
TX_SIZE tx_size, int block, int *x, int *y) { - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int txwl = ss_txfrm_size / 2; - const int tx_cols_log2 = bwl - txwl; + const int bwl = b_width_log2(plane_bsize); + const int tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> ss_txfrm_size; - *x = (raster_mb & (tx_cols - 1)) << (txwl); - *y = raster_mb >> tx_cols_log2 << (txwl); + const int raster_mb = block >> (tx_size << 1); + *x = (raster_mb & (tx_cols - 1)) << tx_size; + *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, - BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) { - const int bw = plane_block_width(bsize, &xd->plane[plane]); - const int bh = plane_block_height(bsize, &xd->plane[plane]); +static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, + int plane, int block, TX_SIZE tx_size) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint8_t *const buf = pd->dst.buf; + const int stride = pd->dst.stride; + int x, y; - txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); x = x * 4 - 1; y = y * 4 - 1; // Copy a pixel into the umv if we are in a situation where the block size @@ -664,41 +510,38 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, // TODO(JBB): Should be able to do the full extend in place so we don't have // to do this multiple times. if (xd->mb_to_right_edge < 0) { - int umv_border_start = bw - + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x)); + const int bw = 4 << b_width_log2(plane_bsize); + const int umv_border_start = bw + (xd->mb_to_right_edge >> + (3 + pd->subsampling_x)); if (x + bw > umv_border_start) - vpx_memset( - xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride - + umv_border_start, - *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride - + umv_border_start - 1), - bw); + vpx_memset(&buf[y * stride + umv_border_start], + buf[y * stride + umv_border_start - 1], bw); } + if (xd->mb_to_bottom_edge < 0) { - int umv_border_start = bh - + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y)); + const int bh = 4 << b_height_log2(plane_bsize); + const int umv_border_start = bh + (xd->mb_to_bottom_edge >> + (3 + pd->subsampling_y)); int i; - uint8_t c = *(xd->plane[plane].dst.buf - + (umv_border_start - 1) * xd->plane[plane].dst.stride + x); - - uint8_t *d = xd->plane[plane].dst.buf - + umv_border_start * xd->plane[plane].dst.stride + x; + const uint8_t c = buf[(umv_border_start - 1) * stride + x]; + uint8_t *d = &buf[umv_border_start * stride + x]; if (y + bh > umv_border_start) - for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride) + for (i = 0; i < bh; ++i, d += stride) *d = c; } } -static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - int plane, int tx_size_in_blocks, - int eob, int aoff, int loff, +static void set_contexts_on_border(MACROBLOCKD *xd, + struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, + int tx_size_in_blocks, int has_eob, + int aoff, int loff, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { - struct macroblockd_plane *pd = &xd->plane[plane]; + int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; + int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; int above_contexts = tx_size_in_blocks; int left_contexts = tx_size_in_blocks; - int mi_blocks_wide = 1 << 
plane_block_width_log2by4(bsize, pd); - int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd); int pt; // xd->mb_to_right_edge is in units of pixels * 8. This converts @@ -706,26 +549,47 @@ static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, if (xd->mb_to_right_edge < 0) mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + if (xd->mb_to_bottom_edge < 0) + mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + // this code attempts to avoid copying into contexts that are outside // our border. Any blocks that do are set to 0... if (above_contexts + aoff > mi_blocks_wide) above_contexts = mi_blocks_wide - aoff; - if (xd->mb_to_bottom_edge < 0) - mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - if (left_contexts + loff > mi_blocks_high) left_contexts = mi_blocks_high - loff; for (pt = 0; pt < above_contexts; pt++) - A[pt] = eob > 0; + A[pt] = has_eob; for (pt = above_contexts; pt < tx_size_in_blocks; pt++) A[pt] = 0; for (pt = 0; pt < left_contexts; pt++) - L[pt] = eob > 0; + L[pt] = has_eob; for (pt = left_contexts; pt < tx_size_in_blocks; pt++) L[pt] = 0; } +static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const A = pd->above_context + aoff; + ENTROPY_CONTEXT *const L = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob, + aoff, loff, A, L); + } else { + vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +static int get_tx_eob(struct segmentation *seg, int segment_id, + TX_SIZE tx_size) { + const int eob_max = 16 << (tx_size << 1); + return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; +} #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index fdf37e4..dc41efd 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -13,33 +13,33 @@ #include "vp9/common/vp9_common_data.h" // Log 2 conversion lookup tables for block width and height -const int b_width_log2_lookup[BLOCK_SIZE_TYPES] = +const int b_width_log2_lookup[BLOCK_SIZES] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; -const int b_height_log2_lookup[BLOCK_SIZE_TYPES] = +const int b_height_log2_lookup[BLOCK_SIZES] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; -const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] = +const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; -const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] = +const int num_4x4_blocks_high_lookup[BLOCK_SIZES] = {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; // Log 2 conversion lookup tables for modeinfo width and height -const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] = +const int mi_width_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; -const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] = +const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; -const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = +const int mi_height_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; -const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = +const int num_8x8_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; // MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize))) -const int size_group_lookup[BLOCK_SIZE_TYPES] = +const int size_group_lookup[BLOCK_SIZES] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3}; -const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] = +const int num_pels_log2_lookup[BLOCK_SIZES] = {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; -const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { +const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { { // 4X4 // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, @@ -74,51 +74,62 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { } }; -const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { +const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = { { // PARTITION_NONE - BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, }, { // PARTITION_HORZ - BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, }, { // PARTITION_VERT - BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, }, { // PARTITION_SPLIT - 
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, } }; -const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, +const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, TX_16X16, TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32 }; -const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, +const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, TX_16X16, TX_16X16, TX_16X16, TX_32X32 }; -const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = { - { BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 }, - { BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 }, - { BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 }, - { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 }, - { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 } +const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { +// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 +// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + {{BLOCK_4X4, BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}}, + {{BLOCK_4X8, BLOCK_4X4}, {BLOCK_INVALID, BLOCK_INVALID}}, + {{BLOCK_8X4, BLOCK_INVALID}, {BLOCK_4X4, BLOCK_INVALID}}, + {{BLOCK_8X8, BLOCK_8X4}, {BLOCK_4X8, BLOCK_4X4}}, + {{BLOCK_8X16, BLOCK_8X8}, {BLOCK_INVALID, BLOCK_4X8}}, + {{BLOCK_16X8, BLOCK_INVALID}, {BLOCK_8X8, BLOCK_8X4}}, + {{BLOCK_16X16, BLOCK_16X8}, {BLOCK_8X16, BLOCK_8X8}}, + {{BLOCK_16X32, BLOCK_16X16}, {BLOCK_INVALID, BLOCK_8X16}}, + {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16, BLOCK_16X8}}, + {{BLOCK_32X32, BLOCK_32X16}, {BLOCK_16X32, BLOCK_16X16}}, + {{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}}, + {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}}, + {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, }; + diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index bc8c01a..3822bfc 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -13,20 +13,20 @@ #include "vp9/common/vp9_enums.h" -extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; -extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; -extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; -extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; -extern const int size_group_lookup[BLOCK_SIZE_TYPES]; -extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES]; -extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES]; -extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; -extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; -extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; -extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5]; +extern const int b_width_log2_lookup[BLOCK_SIZES]; +extern const int 
b_height_log2_lookup[BLOCK_SIZES]; +extern const int mi_width_log2_lookup[BLOCK_SIZES]; +extern const int mi_height_log2_lookup[BLOCK_SIZES]; +extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES]; +extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES]; +extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES]; +extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES]; +extern const int size_group_lookup[BLOCK_SIZES]; +extern const int num_pels_log2_lookup[BLOCK_SIZES]; +extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; +extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; +extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; +extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; +extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; #endif // VP9_COMMON_VP9_COMMON_DATA_H diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index 6f1e418..94231a1 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -14,66 +14,45 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#define VP9_FILTER_WEIGHT 128 -#define VP9_FILTER_SHIFT 7 - -/* Assume a bank of 16 filters to choose from. There are two implementations - * for filter wrapping behavior, since we want to be able to pick which filter - * to start with. We could either: - * - * 1) make filter_ a pointer to the base of the filter array, and then add an - * additional offset parameter, to choose the starting filter. - * 2) use a pointer to 2 periods worth of filters, so that even if the original - * phase offset is at 15/16, we'll have valid data to read. The filter - * tables become [32][8], and the second half is duplicated. - * 3) fix the alignment of the filter tables, so that we know the 0/16 is - * always 256 byte aligned. - * - * Implementations 2 and 3 are likely preferable, as they avoid an extra 2 - * parameters, and switching between them is trivial, with the - * ALIGN_FILTERS_256 macro, below. - */ - #define ALIGN_FILTERS_256 1 - static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; - const int16_t *filter_x_base = filter_x0; + int x, y, k; -#if ALIGN_FILTERS_256 - filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. 
*/ + const int16_t *const filter_x_base = + (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source line */ src -= taps / 2 - 1; for (y = 0; y < h; ++y) { - /* Pointer to filter to use */ - const int16_t *filter_x = filter_x0; - /* Initial phase offset */ - int x0_q4 = (filter_x - filter_x_base) / taps; - int x_q4 = x0_q4; + int x_q4 = (filter_x0 - filter_x_base) / taps; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ - int src_x = (x_q4 - x0_q4) >> 4; + const int src_x = x_q4 >> SUBPEL_BITS; + int sum = 0; - for (sum = 0, k = 0; k < taps; ++k) { + /* Pointer to filter to use */ + const int16_t *const filter_x = filter_x_base + + (x_q4 & SUBPEL_MASK) * taps; + + for (k = 0; k < taps; ++k) sum += src[src_x + k] * filter_x[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT); - /* Adjust source and filter to use for the next pixel */ + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + + /* Move to the next source pixel */ x_q4 += x_step_q4; - filter_x = filter_x_base + (x_q4 & 0xf) * taps; } src += src_stride; dst += dst_stride; @@ -85,37 +64,37 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; - const int16_t *filter_x_base = filter_x0; + int x, y, k; -#if ALIGN_FILTERS_256 - filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. */ + const int16_t *const filter_x_base = + (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source line */ src -= taps / 2 - 1; for (y = 0; y < h; ++y) { - /* Pointer to filter to use */ - const int16_t *filter_x = filter_x0; - /* Initial phase offset */ - int x0_q4 = (filter_x - filter_x_base) / taps; - int x_q4 = x0_q4; + int x_q4 = (filter_x0 - filter_x_base) / taps; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ - int src_x = (x_q4 - x0_q4) >> 4; + const int src_x = x_q4 >> SUBPEL_BITS; + int sum = 0; - for (sum = 0, k = 0; k < taps; ++k) { + /* Pointer to filter to use */ + const int16_t *const filter_x = filter_x_base + + (x_q4 & SUBPEL_MASK) * taps; + + for (k = 0; k < taps; ++k) sum += src[src_x + k] * filter_x[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; - /* Adjust source and filter to use for the next pixel */ + dst[x] = ROUND_POWER_OF_TWO(dst[x] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + + /* Move to the next source pixel */ x_q4 += x_step_q4; - filter_x = filter_x_base + (x_q4 & 0xf) * taps; } src += src_stride; dst += dst_stride; @@ -127,37 +106,37 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; + int x, y, k; - const int16_t *filter_y_base = filter_y0; - -#if ALIGN_FILTERS_256 - filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. 
*/ + const int16_t *const filter_y_base = + (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source column */ src -= src_stride * (taps / 2 - 1); - for (x = 0; x < w; ++x) { - /* Pointer to filter to use */ - const int16_t *filter_y = filter_y0; + for (x = 0; x < w; ++x) { /* Initial phase offset */ - int y0_q4 = (filter_y - filter_y_base) / taps; - int y_q4 = y0_q4; + int y_q4 = (filter_y0 - filter_y_base) / taps; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ - int src_y = (y_q4 - y0_q4) >> 4; + const int src_y = y_q4 >> SUBPEL_BITS; + int sum = 0; - for (sum = 0, k = 0; k < taps; ++k) { + /* Pointer to filter to use */ + const int16_t *const filter_y = filter_y_base + + (y_q4 & SUBPEL_MASK) * taps; + + for (k = 0; k < taps; ++k) sum += src[(src_y + k) * src_stride] * filter_y[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT); - /* Adjust source and filter to use for the next pixel */ + dst[y * dst_stride] = + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + + /* Move to the next source pixel */ y_q4 += y_step_q4; - filter_y = filter_y_base + (y_q4 & 0xf) * taps; } ++src; ++dst; @@ -169,38 +148,37 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; - - const int16_t *filter_y_base = filter_y0; + int x, y, k; -#if ALIGN_FILTERS_256 - filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. */ + const int16_t *const filter_y_base = + (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source column */ src -= src_stride * (taps / 2 - 1); - for (x = 0; x < w; ++x) { - /* Pointer to filter to use */ - const int16_t *filter_y = filter_y0; + for (x = 0; x < w; ++x) { /* Initial phase offset */ - int y0_q4 = (filter_y - filter_y_base) / taps; - int y_q4 = y0_q4; + int y_q4 = (filter_y0 - filter_y_base) / taps; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ - int src_y = (y_q4 - y0_q4) >> 4; + const int src_y = y_q4 >> SUBPEL_BITS; + int sum = 0; + + /* Pointer to filter to use */ + const int16_t *const filter_y = filter_y_base + + (y_q4 & SUBPEL_MASK) * taps; - for (sum = 0, k = 0; k < taps; ++k) { + for (k = 0; k < taps; ++k) sum += src[(src_y + k) * src_stride] * filter_y[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[y * dst_stride] = - (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; - /* Adjust source and filter to use for the next pixel */ + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + + /* Move to the next source pixel */ y_q4 += y_step_q4; - filter_y = filter_y_base + (y_q4 & 0xf) * taps; } ++src; ++dst; @@ -213,58 +191,27 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 135, for y_step_q4 == 32, + * Maximum intermediate_height is 324, for y_step_q4 == 80, * h == 64, taps == 8. 
+ * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc */ - uint8_t temp[64 * 135]; - int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; + uint8_t temp[64 * 324]; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps; assert(w <= 64); assert(h <= 64); assert(taps <= 8); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 80); + assert(x_step_q4 <= 80); if (intermediate_height < h) intermediate_height = h; - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height, taps); - convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, taps); -} - -static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 135, for y_step_q4 == 32, - * h == 64, taps == 8. - */ - uint8_t temp[64 * 135]; - int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; - - assert(w <= 64); - assert(h <= 64); - assert(taps <= 8); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (intermediate_height < h) - intermediate_height = h; - - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height, taps); - convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, taps); + convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, + intermediate_height, taps); + convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, taps); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -273,8 +220,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -283,8 +229,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -293,8 +238,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -303,8 +247,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_c(const uint8_t *src, ptrdiff_t 
src_stride, @@ -313,8 +256,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, @@ -327,16 +269,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_convolve8(src, src_stride, - temp, 64, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); - vp9_convolve_avg(temp, 64, - dst, dst_stride, - NULL, 0, /* These unused parameter should be removed! */ - NULL, 0, /* These unused parameter should be removed! */ - w, h); + vp9_convolve8(src, src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); } void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, @@ -361,9 +296,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, int x, y; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = (dst[x] + src[x] + 1) >> 1; - } + for (x = 0; x < w; ++x) + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; dst += dst_stride; } diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h index 3de8111..13220e9 100644 --- a/libvpx/vp9/common/vp9_convolve.h +++ b/libvpx/vp9/common/vp9_convolve.h @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#define FILTER_BITS 7 + typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 370ebe8..79f769e 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -22,23 +22,24 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. 
*/ -static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor, +static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, size_t member_offset) { int mi_row; int mi_col; int mi_index = 0; - MODE_INFO *mi = common->mi; - int rows = common->mi_rows; - int cols = common->mi_cols; + MODE_INFO **mi_8x8 = cm->mi_grid_visible; + int rows = cm->mi_rows; + int cols = cm->mi_cols; char prefix = descriptor[0]; - log_frame_info(common, descriptor, file); + log_frame_info(cm, descriptor, file); mi_index = 0; for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", - *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset))); + *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) + + member_offset))); mi_index++; } fprintf(file, "\n"); @@ -51,23 +52,23 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { int mi_col; int mi_index = 0; FILE *mvs = fopen(file, "a"); - MODE_INFO *mi = cm->mi; + MODE_INFO **mi_8x8 = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); - print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size)); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); log_frame_info(cm, "Vectors ",mvs); for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs,"V "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row, - mi[mi_index].mbmi.mv[0].as_mv.col); + fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, + mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); mi_index++; } fprintf(mvs, "\n"); diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index df3a9fe..32d9e0c 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -377,7 +377,7 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { static void extend_model_to_full_distribution(vp9_prob p, vp9_prob *tree_probs) { - const int l = ((p - 1) / 2); + const int l = (p - 1) / 2; const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; if (p & 1) { vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, @@ -436,11 +436,11 @@ const vp9_extra_bit vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" -void vp9_default_coef_probs(VP9_COMMON *pc) { - vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4); - vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8); - vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16); - vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32); +void vp9_default_coef_probs(VP9_COMMON *cm) { + vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4); + vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } // Neighborhood 5-tuples for various scans and blocksizes, @@ -622,7 +622,6 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, int t, i, j, k, l; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; 
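/* A sketch of the adaptation performed below (an illustrative note, assuming
 * merge_probs() is the usual count-saturated blend; this is not text from the
 * original source): each adapted probability is a count-weighted mix of the
 * previous frame's probability and the one implied by this frame's counts:
 *
 *   factor = update_factor * MIN(count, count_sat) / count_sat;
 *   merged = ROUND_POWER_OF_TWO(pre_prob * (256 - factor)
 *                               + new_prob * factor, 8);
 *
 * so a branch seen count_sat or more times moves update_factor/256 of the
 * way toward this frame's estimate, while an unseen branch keeps the
 * previous frame's probability. */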
vp9_prob coef_probs[UNCONSTRAINED_NODES]; - int entropy_nodes_adapt = UNCONSTRAINED_NODES; for (i = 0; i < BLOCK_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) @@ -635,7 +634,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < entropy_nodes_adapt; ++t) + for (t = 0; t < UNCONSTRAINED_NODES; ++t) dst_coef_probs[i][j][k][l][t] = merge_probs( pre_coef_probs[i][j][k][l][t], coef_probs[t], branch_ct[t], count_sat, update_factor); diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 861c078..f138c09 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -95,7 +95,7 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] #define MODULUS_PARAM 13 /* Modulus parameter */ struct VP9Common; -void vp9_default_coef_probs(struct VP9Common *); +void vp9_default_coef_probs(struct VP9Common *cm); extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); @@ -154,19 +154,17 @@ extern DECLARE_ALIGNED(16, int16_t, vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); void vp9_coef_tree_initialize(void); -void vp9_adapt_coef_probs(struct VP9Common *); +void vp9_adapt_coef_probs(struct VP9Common *cm); -static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd, - BLOCK_SIZE_TYPE bsize) { - /* Clear entropy contexts */ - const int bw = 1 << b_width_log2(bsize); - const int bh = 1 << b_height_log2(bsize); +static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { int i; for (i = 0; i < MAX_MB_PLANE; i++) { - vpx_memset(xd->plane[i].above_context, 0, - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x); - vpx_memset(xd->plane[i].left_context, 0, - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y); + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * + num_4x4_blocks_wide_lookup[plane_bsize]); + vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * + num_4x4_blocks_high_lookup[plane_bsize]); } } @@ -338,6 +336,45 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { } } +static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx, + ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, + const int16_t **scan, + const uint8_t **band_translate) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); + *band_translate = vp9_coefband_trans_4x4; + above_ec = A[0] != 0; + left_ec = L[0] != 0; + break; + case TX_8X8: + *scan = get_scan_8x8(get_tx_type_8x8(type, xd)); + *band_translate = vp9_coefband_trans_8x8plus; + above_ec = !!*(uint16_t *)A; + left_ec = !!*(uint16_t *)L; + break; + case TX_16X16: + *scan = get_scan_16x16(get_tx_type_16x16(type, xd)); + *band_translate = vp9_coefband_trans_8x8plus; + above_ec = !!*(uint32_t *)A; + left_ec = !!*(uint32_t *)L; + break; + case TX_32X32: + *scan = vp9_default_scan_32x32; + *band_translate = vp9_coefband_trans_8x8plus; + above_ec = !!*(uint64_t *)A; + left_ec = !!*(uint64_t *)L; + break; + default: + assert(!"Invalid transform size."); + } + + return combine_entropy_contexts(above_ec, left_ec); +} + enum { VP9_COEF_UPDATE_PROB = 252 }; #endif 
// VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 768e5f5..93c89b0 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -14,8 +14,8 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_seg_common.h" -const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES] + [INTRA_MODES - 1] = { { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, @@ -23,21 +23,21 @@ const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES] { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */, { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */, { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */, - { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d27 */, + { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */, { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */, { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */ }; static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS] - [VP9_INTRA_MODES - 1] = { + [INTRA_MODES - 1] = { { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */, { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */, { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */, { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */ }; -static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +static const vp9_prob default_if_uv_probs[INTRA_MODES] + [INTRA_MODES - 1] = { { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */, { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */, { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */, @@ -45,7 +45,7 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */, { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */, { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */, - { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d27 */, + { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */, { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */, { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ }; @@ -98,9 +98,9 @@ static const vp9_prob default_partition_probs[NUM_FRAME_TYPES] } }; -const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES] + [INTRA_MODES] + [INTRA_MODES - 1] = { { /* above = dc */ { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, @@ -109,7 +109,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, - { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, + { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */, { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ }, { /* above = v */ @@ -120,7 +120,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, - { 52, 30, 
74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, + { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */, { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ }, { /* above = h */ @@ -131,7 +131,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, - { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, + { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */, { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ }, { /* above = d45 */ @@ -142,7 +142,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, - { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, + { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */, { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ }, { /* above = d135 */ @@ -153,7 +153,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, - { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, + { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */, { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ }, { /* above = d117 */ @@ -164,7 +164,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, - { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, + { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */, { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ }, { /* above = d153 */ @@ -175,10 +175,10 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, - { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, + { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */, { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ - }, { /* above = d27 */ + }, { /* above = d207 */ { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, @@ -186,7 +186,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, - { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, + { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */, { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* 
left = tm */ }, { /* above = d63 */ @@ -197,7 +197,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, - { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, + { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */, { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ }, { /* above = tm */ @@ -208,14 +208,14 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, - { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, + { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */, { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ } }; static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] - [VP9_INTER_MODES - 1] = { + [INTER_MODES - 1] = { {2, 173, 34}, // 0 = both zero mv {7, 145, 85}, // 1 = one zero mv + one a predicted mv {7, 166, 63}, // 2 = two predicted mvs @@ -226,7 +226,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] }; /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ -const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { +const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = { -DC_PRED, 2, /* 0 = DC_NODE */ -TM_PRED, 4, /* 1 = TM_NODE */ -V_PRED, 6, /* 2 = V_NODE */ @@ -235,7 +235,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { -D135_PRED, -D117_PRED, /* 5 = D135_NODE */ -D45_PRED, 14, /* 6 = D45_NODE */ -D63_PRED, 16, /* 7 = D63_NODE */ - -D153_PRED, -D27_PRED /* 8 = D153_NODE */ + -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; const vp9_tree_index vp9_inter_mode_tree[6] = { @@ -250,8 +250,8 @@ const vp9_tree_index vp9_partition_tree[6] = { -PARTITION_VERT, -PARTITION_SPLIT }; -struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; -struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; +struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; +struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; @@ -317,8 +317,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS-1] = { +static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1] + [SWITCHABLE_FILTERS-1] = { { 235, 162, }, { 36, 255, }, { 34, 3, }, @@ -338,14 +338,11 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); } -const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { - -0, 2, - -1, -2 +const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = { + -EIGHTTAP, 2, + -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; -const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { - EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP}; -const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, 0, 2, -1, -1}; +struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; void vp9_entropy_mode_init() { 
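  /* Sketch of the tree convention relied on above (illustrative, not part of
   * the original source): vp9_tree_index entries come in pairs, one pair per
   * internal node.  A negative entry -v is a leaf coding symbol v; a
   * non-negative entry is the index of the pair to descend into.  The
   * switchable interp tree above therefore reads: bit 0 -> EIGHTTAP,
   * bits 1,0 -> EIGHTTAP_SMOOTH, bits 1,1 -> EIGHTTAP_SHARP.  The
   * vp9_tokens_from_tree() calls below flatten such trees into
   * (value, length) codes. */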
vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); @@ -403,17 +400,17 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree, + update_mode_probs(INTER_MODES, vp9_inter_mode_tree, counts->inter_mode[i], pre_fc->inter_mode_probs[i], fc->inter_mode_probs[i], NEARESTMV); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, + update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, counts->y_mode[i], pre_fc->y_mode_prob[i], fc->y_mode_prob[i], 0); - for (i = 0; i < VP9_INTRA_MODES; ++i) - update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, + for (i = 0; i < INTRA_MODES; ++i) + update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, counts->uv_mode[i], pre_fc->uv_mode_prob[i], fc->uv_mode_prob[i], 0); @@ -424,8 +421,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { fc->partition_prob[INTER_FRAME][i], 0); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) - update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree, + for (i = 0; i <= SWITCHABLE_FILTERS; i++) + update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree, counts->switchable_interp[i], pre_fc->switchable_interp_prob[i], fc->switchable_interp_prob[i], 0); @@ -443,14 +440,12 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); - tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], - branch_ct_16x16p); + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); - tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], - branch_ct_32x32p); + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); @@ -475,14 +470,14 @@ static void set_default_lf_deltas(struct loopfilter *lf) { lf->mode_deltas[1] = 0; } -void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { +void vp9_setup_past_independence(VP9_COMMON *cm) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). 
- struct loopfilter *const lf = &xd->lf; + struct loopfilter *const lf = &cm->lf; int i; - vp9_clearall_segfeatures(&xd->seg); - xd->seg.abs_delta = SEGMENT_DELTADATA; + vp9_clearall_segfeatures(&cm->seg); + cm->seg.abs_delta = SEGMENT_DELTADATA; if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); @@ -515,10 +510,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_in_image(cm, cm->mi); - vp9_update_mode_info_border(cm, cm->prev_mip); - vp9_update_mode_info_in_image(cm, cm->prev_mi); vp9_zero(cm->ref_frame_sign_bias); diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 17a7c26..4cf4c03 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -16,8 +16,8 @@ #define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 -#define VP9_MODE_UPDATE_PROB 252 -#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters +#define MODE_UPDATE_PROB 252 +#define SWITCHABLE_FILTERS 3 // number of switchable filters // #define MODE_STATS @@ -35,37 +35,32 @@ struct tx_counts { unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; -extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; -extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] + [INTRA_MODES - 1]; extern const vp9_tree_index vp9_intra_mode_tree[]; extern const vp9_tree_index vp9_inter_mode_tree[]; -extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; -extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; +extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; +extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; // probability models for partition information extern const vp9_tree_index vp9_partition_tree[]; extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; -extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp - [VP9_SWITCHABLE_FILTERS]; - -extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; - extern const vp9_tree_index vp9_switchable_interp_tree - [2 * (VP9_SWITCHABLE_FILTERS - 1)]; + [2 * (SWITCHABLE_FILTERS - 1)]; -extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; void vp9_entropy_mode_init(); -void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); +void vp9_setup_past_independence(struct VP9Common *cm); -void vp9_init_mbmode_probs(struct VP9Common *x); +void vp9_init_mbmode_probs(struct VP9Common *cm); -void vp9_adapt_mode_probs(struct VP9Common *); +void vp9_adapt_mode_probs(struct VP9Common *cm); void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]); diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index 6cfc346..2e973e5 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -79,20 +79,59 @@ static const nmv_context default_nmv_context = { #define mv_class_base(c) ((c) ? 
(CLASS0_SIZE << (c + 2)) : 0) +static const uint8_t log_in_base_2[] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 +}; + MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { MV_CLASS_TYPE c = MV_CLASS_0; - if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; - else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; - else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; - else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3; - else if (z < CLASS0_SIZE * 128) c = 
MV_CLASS_4; - else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; - else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; - else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; - else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; - else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; - else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10; - else assert(0); + if (z >= CLASS0_SIZE * 4096) + c = MV_CLASS_10; + else + c = log_in_base_2[z >> 3]; + if (offset) *offset = z - mv_class_base(c); return c; @@ -110,8 +149,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, int usehp) { int s, z, c, o, d, e, f; - if (!incr) - return; assert (v != 0); /* should not be zero */ s = v < 0; comp_counts->sign[s] += incr; @@ -123,61 +160,39 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts, d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { comp_counts->class0[d] += incr; + comp_counts->class0_fp[d][f] += incr; + comp_counts->class0_hp[e] += usehp * incr; } else { int i; int b = c + CLASS0_BITS - 1; // number of bits for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr; - } - - /* Code the fractional pel bits */ - if (c == MV_CLASS_0) { - comp_counts->class0_fp[d][f] += incr; - } else { comp_counts->fp[f] += incr; - } - - /* Code the high precision bit */ - if (usehp) { - if (c == MV_CLASS_0) { - comp_counts->class0_hp[e] += incr; - } else { - comp_counts->hp[e] += incr; - } + comp_counts->hp[e] += usehp * incr; } } -static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { - int v; - vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); - for (v = 1; v <= MV_MAX; v++) { - inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); - inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); - } -} void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); ++counts->joints[j]; - if (mv_joint_vertical(j)) - ++counts->comps[0].mvcount[MV_MAX + mv->row]; + if (mv_joint_vertical(j)) { + inc_mv_component(mv->row, &counts->comps[0], 1, 1); + } - if (mv_joint_horizontal(j)) - ++counts->comps[1].mvcount[MV_MAX + mv->col]; + if (mv_joint_horizontal(j)) { + inc_mv_component(mv->col, &counts->comps[1], 1, 1); + } } static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } -void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { - counts_to_context(&nmv_count->comps[0], usehp); - counts_to_context(&nmv_count->comps[1], usehp); -} - static unsigned int adapt_probs(unsigned int i, vp9_tree tree, vp9_prob this_probs[], @@ -207,8 +222,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { nmv_context *pre_ctx = &pre_fc->nmvc; nmv_context_counts *cts = &cm->counts.mv; - vp9_counts_process(cts, allow_hp); - adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); for (i = 0; i < 2; ++i) { diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index 85a1f3a..a10c933 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -24,7 +24,7 @@ void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); int vp9_use_mv_hp(const MV *ref); -#define VP9_NMV_UPDATE_PROB 252 +#define NMV_UPDATE_PROB 252 /* Symbols for coding 
which components are zero jointly */ #define MV_JOINTS 4 @@ -126,6 +126,4 @@ typedef struct { void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); -void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); - #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 3208b72..1bf0742 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -13,37 +13,40 @@ #include "./vpx_config.h" -#define LOG2_MI_SIZE 3 -#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6 +#define MI_SIZE_LOG2 3 +#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 -#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit -#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block +#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit +#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block #define MI_MASK (MI_BLOCK_SIZE - 1) -typedef enum BLOCK_SIZE_TYPE { - BLOCK_SIZE_AB4X4, BLOCK_4X4 = BLOCK_SIZE_AB4X4, - BLOCK_SIZE_SB4X8, BLOCK_4X8 = BLOCK_SIZE_SB4X8, - BLOCK_SIZE_SB8X4, BLOCK_8X4 = BLOCK_SIZE_SB8X4, - BLOCK_SIZE_SB8X8, BLOCK_8X8 = BLOCK_SIZE_SB8X8, - BLOCK_SIZE_SB8X16, BLOCK_8X16 = BLOCK_SIZE_SB8X16, - BLOCK_SIZE_SB16X8, BLOCK_16X8 = BLOCK_SIZE_SB16X8, - BLOCK_SIZE_MB16X16, BLOCK_16X16 = BLOCK_SIZE_MB16X16, - BLOCK_SIZE_SB16X32, BLOCK_16X32 = BLOCK_SIZE_SB16X32, - BLOCK_SIZE_SB32X16, BLOCK_32X16 = BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, BLOCK_32X32 = BLOCK_SIZE_SB32X32, - BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64, - BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64, - BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES -} BLOCK_SIZE_TYPE; + +typedef enum BLOCK_SIZE { + BLOCK_4X4, + BLOCK_4X8, + BLOCK_8X4, + BLOCK_8X8, + BLOCK_8X16, + BLOCK_16X8, + BLOCK_16X16, + BLOCK_16X32, + BLOCK_32X16, + BLOCK_32X32, + BLOCK_32X64, + BLOCK_64X32, + BLOCK_64X64, + BLOCK_SIZES, + BLOCK_INVALID = BLOCK_SIZES +} BLOCK_SIZE; typedef enum PARTITION_TYPE { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, - PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES + PARTITION_TYPES, + PARTITION_INVALID = PARTITION_TYPES } PARTITION_TYPE; #define PARTITION_PLOFFSET 4 // number of probability models per block size diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c index d8496c4..07c68c8 100644 --- a/libvpx/vp9/common/vp9_extend.c +++ b/libvpx/vp9/common/vp9_extend.c @@ -57,15 +57,23 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch, void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { - const int et_y = dst->border; - const int el_y = dst->border; - const int eb_y = dst->border + dst->y_height - src->y_height; - const int er_y = dst->border + dst->y_width - src->y_width; - - const int et_uv = dst->border >> (dst->uv_height != dst->y_height); - const int el_uv = dst->border >> (dst->uv_width != dst->y_width); - const int eb_uv = et_uv + dst->uv_height - src->uv_height; - const int er_uv = el_uv + dst->uv_width - src->uv_width; + // Extend src frame in buffer + // Altref filtering assumes 16 pixel extension + const int et_y = 16; + const int el_y = 16; + // Motion estimation may use src block variance with the block size up + // to 64x64, so the right and bottom need to be extended to a multiple + // of 64, or up to 16, whichever is greater.
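  // Worked numbers for the extension below (illustrative, not from the
  // source): for a 1280x720 frame, ALIGN_POWER_OF_TWO(1280, 6) - 1280 = 0
  // and ALIGN_POWER_OF_TWO(720, 6) - 720 = 48, so the two borders are
  // extended by MAX(0, 16) = 16 and MAX(48, 16) = 48 pixels respectively.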
+ const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width, + 16); + const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height, + 16); + const int uv_width_subsampling = (src->uv_width != src->y_width); + const int uv_height_subsampling = (src->uv_height != src->y_height); + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >> uv_width_subsampling; #if CONFIG_ALPHA const int et_a = dst->border >> (dst->alpha_height != dst->y_height); diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c index e5503cd..4ac2bc9 100644 --- a/libvpx/vp9/common/vp9_filter.c +++ b/libvpx/vp9/common/vp9_filter.c @@ -8,14 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vpx_ports/mem.h" -#include <stdlib.h> #include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" -#include "vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { +DECLARE_ALIGNED(256, const int16_t, + vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -34,8 +32,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { { 0, 0, 0, 8, 120, 0, 0, 0 } }; -DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { - /* Lagrangian interpolation filter */ +// Lagrangian interpolation filter +DECLARE_ALIGNED(256, const int16_t, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -54,9 +53,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { { 0, 1, -3, 8, 126, -5, 1, 0} }; -DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) - = { - /* dct based filter */ +// DCT based filter +DECLARE_ALIGNED(256, const int16_t, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -75,9 +74,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) {0, 1, -3, 8, 127, -7, 3, -1} }; +// freqmultiplier = 0.5 DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = { - /* freqmultiplier = 0.5 */ + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h index 1ccfdaa..7b1ffae 100644 --- a/libvpx/vp9/common/vp9_filter.h +++ b/libvpx/vp9/common/vp9_filter.h @@ -12,26 +12,22 @@ #define VP9_COMMON_VP9_FILTER_H_ #include "vpx_config.h" -#include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" -#define BLOCK_HEIGHT_WIDTH 4 -#define VP9_FILTER_WEIGHT 128 -#define VP9_FILTER_SHIFT 7 +#define SUBPEL_BITS 4 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) +#define SUBPEL_TAPS 8 -#define SUBPEL_SHIFTS 16 - -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; +extern const int16_t 
vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]; // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. -#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \ - sizeof(vp9_bilinear_filters[0][0])) -#define BF_OFFSET (BF_LENGTH / 2 - 1) -#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET) +#define BILINEAR_FILTERS_2TAP(x) \ + (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index 3af8b8d..49a731f 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -8,19 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <limits.h> - #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_sadmxn.h" -static void lower_mv_precision(int_mv *mv, int allow_hp) { - const int use_hp = allow_hp && vp9_use_mv_hp(&mv->as_mv); +static void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(mv); if (!use_hp) { - if (mv->as_mv.row & 1) - mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); - if (mv->as_mv.col & 1) - mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1); + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? -1 : 1); } } @@ -32,7 +29,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int i; // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); + lower_mv_precision(&mvlist[i].as_mv, xd->allow_high_precision_mv); clamp_mv2(&mvlist[i].as_mv, xd); } *nearest = mvlist[0]; @@ -46,17 +43,14 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { int_mv dst_list[MAX_MV_REF_CANDIDATES]; int_mv mv_list[MAX_MV_REF_CANDIDATES]; - MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MODE_INFO *const mi = xd->this_mi; assert(ref_idx == 0 || ref_idx == 1); assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context, - xd->prev_mode_info_context, - mbmi->ref_frame[ref_idx], - mv_list, cm->ref_frame_sign_bias, block_idx, - mi_row, mi_col); + vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi, + mi->mbmi.ref_frame[ref_idx], + mv_list, block_idx, mi_row, mi_col); dst_list[1].as_int = 0; if (block_idx == 0) { diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index e5221ed..ad0d882 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -36,48 +36,57 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, int mi_row, int mi_col); -static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { +static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, + const MODE_INFO *left_mb, int b) { // 
FIXME(rbultje, jingning): temporary hack because jenkins doesn't // understand this condition. This will go away soon. + const MODE_INFO *mi = cur_mb; + if (b == 0 || b == 2) { /* On L edge, get from MB to left of us */ - --cur_mb; + mi = left_mb; + if (!mi) + return DC_PRED; - if (is_inter_block(&cur_mb->mbmi)) { + if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; - } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 1 + b)->as_mode); + } else if (mi->mbmi.sb_type < BLOCK_8X8) { + return ((mi->bmi + 1 + b)->as_mode); } else { - return cur_mb->mbmi.mode; + return mi->mbmi.mode; } } assert(b == 1 || b == 3); - return (cur_mb->bmi + b - 1)->as_mode; + return (mi->bmi + b - 1)->as_mode; } static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, - int b, int mi_stride) { + const MODE_INFO *above_mb, int b) { + const MODE_INFO *mi = cur_mb; + if (!(b >> 1)) { /* On top edge, get from MB above us */ - cur_mb -= mi_stride; + mi = above_mb; + if (!mi) + return DC_PRED; - if (is_inter_block(&cur_mb->mbmi)) { + if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; - } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 2 + b)->as_mode); + } else if (mi->mbmi.sb_type < BLOCK_8X8) { + return ((mi->bmi + 2 + b)->as_mode); } else { - return cur_mb->mbmi.mode; + return mi->mbmi.mode; } } - return (cur_mb->bmi + b - 2)->as_mode; + return (mi->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 2d959f0..0c47da6 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -27,6 +27,9 @@ #define pair_set_epi16(a, b) \ _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32(b, a, b, a) + // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 66df627..cfb5cd4 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -22,13 +22,217 @@ struct loop_filter_info { const uint8_t *hev_thr; }; +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; +} LOOP_FILTER_MASK; + +// 64 bit masks for left transform size. Each 1 represents a position where +// we should apply a loop filter across the left border of an 8x8 block +// boundary. +// +// In the case of TX_16X16-> ( in low order byte first we end up with +// a mask that looks like this +// +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// +// A loopfilter should be applied to every other 8x8 horizontally. 
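To make the layout concrete, here is a small standalone sketch (not part of this change) that prints a 64-bit mask as the 8x8 grid of 8x8 blocks it encodes, low-order byte first; fed the TX_16X16 entry of the left-mask table that follows, it reproduces the alternating-column figure above.

#include <stdio.h>
#include <stdint.h>

/* Print a 64-bit loop-filter mask as an 8x8 grid. Bit (r * 8 + c) is the
 * 8x8 block in row r, column c of the 64x64 superblock, so the low-order
 * byte is the top row of the figure. */
static void print_mask(uint64_t mask) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c)
      putchar(((mask >> (r * 8 + c)) & 1) ? '1' : '0');
    putchar('\n');
  }
}

int main(void) {
  print_mask(0x5555555555555555ULL);  /* TX_16X16 left mask: 10101010 rows */
  return 0;
}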
+static const uint64_t left_64x64_txform_mask[TX_SIZES] = { + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x5555555555555555, // TX_16x16 + 0x1111111111111111, // TX_32x32 +}; + +// 64 bit masks for above transform size. Each 1 represents a position where +// we should apply a loop filter across the top border of an 8x8 block +// boundary. +// +// In the case of TX_32X32 (low order byte first) we end up with +// a mask that looks like this: +// +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// +// A loopfilter should be applied to every 4th row vertically. +static const uint64_t above_64x64_txform_mask[TX_SIZES] = { + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x00ff00ff00ff00ff, // TX_16x16 + 0x000000ff000000ff, // TX_32x32 +}; + +// 64 bit masks for prediction sizes (left). Each 1 represents a position +// on the left border of an 8x8 block. These are aligned to the rightmost +// appropriate bit, and then shifted into place. +// +// In the case of BLOCK_16X32 (low order byte first) we end up with +// a mask that looks like this: +// +// 10000000 +// 10000000 +// 10000000 +// 10000000 +// 00000000 +// 00000000 +// 00000000 +// 00000000 +static const uint64_t left_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001, // BLOCK_4X4, + 0x0000000000000001, // BLOCK_4X8, + 0x0000000000000001, // BLOCK_8X4, + 0x0000000000000001, // BLOCK_8X8, + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000001, // BLOCK_16X8, + 0x0000000000000101, // BLOCK_16X16, + 0x0000000001010101, // BLOCK_16X32, + 0x0000000000000101, // BLOCK_32X16, + 0x0000000001010101, // BLOCK_32X32, + 0x0101010101010101, // BLOCK_32X64, + 0x0000000001010101, // BLOCK_64X32, + 0x0101010101010101, // BLOCK_64X64 +}; + +// 64 bit above masks to shift and set for each prediction size. +static const uint64_t above_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000001, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000003, // BLOCK_16X16 + 0x0000000000000003, // BLOCK_16X32, + 0x000000000000000f, // BLOCK_32X16, + 0x000000000000000f, // BLOCK_32X32, + 0x000000000000000f, // BLOCK_32X64, + 0x00000000000000ff, // BLOCK_64X32, + 0x00000000000000ff, // BLOCK_64X64 +}; +// 64 bit mask to shift and set for each prediction size. A bit is set for +// each 8x8 block that the given block size would cover when placed at the +// top left of the 64x64 block. +static const uint64_t size_mask[BLOCK_SIZES] = { + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000303, // BLOCK_16X16 + 0x0000000003030303, // BLOCK_16X32, + 0x0000000000000f0f, // BLOCK_32X16, + 0x000000000f0f0f0f, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, + 0x00000000ffffffff, // BLOCK_64X32, + 0xffffffffffffffff, // BLOCK_64X64 +}; + +// These are used for masking the left and above borders. +static const uint64_t left_border = 0x1111111111111111; +static const uint64_t above_border = 0x000000ff000000ff; + +// 16 bit masks for uv transform sizes.
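With 4:2:0 subsampling, the 64x64 luma superblock covers a 32x32 chroma area, i.e. a 4x4 grid of 8x8 chroma blocks, which is why the uv tables that follow fit in 16 bits. A hypothetical helper (mine, not the change's) mapping a luma grid position to its uv bit index:

#include <stdio.h>

/* Map an 8x8 luma block position (row, col in 0..7) inside the 64x64
 * superblock to the bit index of the covering chroma block in a 16-bit
 * uv mask (a 4x4 grid), assuming 4:2:0 subsampling. */
static int uv_shift(int row, int col) {
  return (row >> 1) * 4 + (col >> 1);
}

int main(void) {
  /* The four 32x32 quadrants land on uv bits 0, 2, 8, 10, matching the
   * shift_32_uv table used by setup_mask further below. */
  printf("%d %d %d %d\n", uv_shift(0, 0), uv_shift(0, 4),
         uv_shift(4, 0), uv_shift(4, 4));
  return 0;
}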
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 +}; + +static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 +}; + +// 16 bit left mask to shift and set for each uv prediction size. +static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 +}; +// 16 bit above mask to shift and set for uv each prediction size. +static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 +}; + +// 64 bit mask to shift and set for each uv prediction size +static const uint16_t size_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 +}; +static const uint16_t left_border_uv = 0x1111; +static const uint16_t above_border_uv = 0x000f; + + static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[DC_PRED] = 0; lfi->mode_lf_lut[D45_PRED] = 0; lfi->mode_lf_lut[D135_PRED] = 0; lfi->mode_lf_lut[D117_PRED] = 0; lfi->mode_lf_lut[D153_PRED] = 0; - lfi->mode_lf_lut[D27_PRED] = 0; + lfi->mode_lf_lut[D207_PRED] = 0; lfi->mode_lf_lut[D63_PRED] = 0; lfi->mode_lf_lut[V_PRED] = 0; lfi->mode_lf_lut[H_PRED] = 0; @@ -39,7 +243,7 @@ static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[NEWMV] = 1; } -static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) { +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; // For each possible value for the loop filter fill out limits @@ -61,8 +265,9 @@ static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) { } } -void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { +void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; int i; // init limits for given sharpness @@ -77,16 +282,15 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } -void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, - int default_filt_lvl) { +void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int seg_id; // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 const int n_shift = default_filt_lvl >> 5; loop_filter_info_n *const lfi = &cm->lf_info; - struct loopfilter *const lf = &xd->lf; - struct segmentation *const seg = &xd->seg; + struct loopfilter *const lf = &cm->lf; + struct segmentation *const seg = &cm->seg; // update 
limits if sharpness has changed if (lf->last_sharpness_level != lf->sharpness_level) { @@ -98,7 +302,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment - if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) { + if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); lvl_seg = seg->abs_delta == SEGMENT_ABSDATA ? data @@ -108,7 +312,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas - vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4); + vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); continue; } @@ -124,9 +328,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, } } -static int build_lfi(const loop_filter_info_n *const lfi_n, - const MB_MODE_INFO *const mbmi, - struct loop_filter_info *const lfi) { +static int build_lfi(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi, + struct loop_filter_info *lfi) { const int seg = mbmi->segment_id; const int ref = mbmi->ref_frame[0]; const int mode = lfi_n->mode_lf_lut[mbmi->mode]; @@ -236,10 +440,360 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } } -static void filter_block_plane(VP9_COMMON *const cm, - struct macroblockd_plane *const plane, - const MODE_INFO *mi, - int mi_row, int mi_col) { +// This function ors into the current lfm structure, where to do loop +// filters for the specific mi we are looking at. It uses information +// including the block_size_type (32x16, 32x32, etc), the transform size, +// whether there were any coefficients encoded, and the loop filter strength +// block we are currently looking at. Shift is used to position the +// 1's we produce. +// TODO(JBB) Need another function for different resolution color.. +static void build_masks(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + const int shift_uv, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->mbmi.sb_type; + const TX_SIZE tx_size_y = mi->mbmi.tx_size; + const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); + const int skip = mi->mbmi.skip_coeff; + const int seg = mi->mbmi.segment_id; + const int ref = mi->mbmi.ref_frame[0]; + const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode]; + uint64_t *left_y = &lfm->left_y[tx_size_y]; + uint64_t *above_y = &lfm->above_y[tx_size_y]; + uint64_t *int_4x4_y = &lfm->int_4x4_y; + uint16_t *left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *int_4x4_uv = &lfm->int_4x4_uv; + + // If filter level is 0 we don't loop filter. + if (!filter_level) + return; + + // These set 1 in the current block size for the block size edges. + // For instance if the block size is 32x16, we'll set : + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and v set things on a 16 bit scale. 
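As a check on the 32x16 example just described (a standalone sketch against the table values above, not code from the change): above_prediction_mask[BLOCK_32X16] is 0xf, the four top-edge blocks of row 0, and left_prediction_mask[BLOCK_32X16] is 0x0101, the left-edge block of rows 0 and 1.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Table values for BLOCK_32X16: 4 blocks wide, 2 blocks tall. */
  const uint64_t above_32x16 = 0x000000000000000f;  /* row 0, cols 0..3 */
  const uint64_t left_32x16  = 0x0000000000000101;  /* col 0, rows 0..1 */
  int c, r;
  for (c = 0; c < 4; ++c)
    assert((above_32x16 >> c) & 1);       /* whole top edge marked */
  for (r = 0; r < 2; ++r)
    assert((left_32x16 >> (r * 8)) & 1);  /* whole left edge marked */
  return 0;
}

Shifting either value by shift_y = row * 8 + col then positions the pattern anywhere in the 64x64 grid, which is what the ors below do.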
+ // + *above_y |= above_prediction_mask[block_size] << shift_y; + *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; + *left_y |= left_prediction_mask[block_size] << shift_y; + *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (skip && ref > INTRA_FRAME) + return; + + // Here we are adding a mask for the transform size. The transform + // size mask is set to be correct for a 64x64 prediction block size. We + // mask to match the size of the block we are working on and then shift it + // into place.. + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + *above_uv |= (size_mask_uv[block_size] & + above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + *left_uv |= (size_mask_uv[block_size] & + left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + // Here we are trying to determine what to do with the internal 4x4 block + // boundaries. These differ from the 4x4 boundaries on the outside edge of + // an 8x8 in that the internal ones can be skipped and don't depend on + // the prediction block size. + if (tx_size_y == TX_4X4) { + *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; + } + if (tx_size_uv == TX_4X4) { + *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; + } +} + +// This function does the same thing as the one above with the exception that +// it only affects the y masks. It exists because for blocks < 16x16 in size, +// we only update u and v masks on the first block. +static void build_y_mask(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->mbmi.sb_type; + const TX_SIZE tx_size_y = mi->mbmi.tx_size; + const int skip = mi->mbmi.skip_coeff; + const int seg = mi->mbmi.segment_id; + const int ref = mi->mbmi.ref_frame[0]; + const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode]; + uint64_t *left_y = &lfm->left_y[tx_size_y]; + uint64_t *above_y = &lfm->above_y[tx_size_y]; + uint64_t *int_4x4_y = &lfm->int_4x4_y; + + if (!filter_level) + return; + + *above_y |= above_prediction_mask[block_size] << shift_y; + *left_y |= left_prediction_mask[block_size] << shift_y; + + if (skip && ref > INTRA_FRAME) + return; + + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + + if (tx_size_y == TX_4X4) { + *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; + } +} + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +// TODO(JBB): This function only works for yv12. +static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { + int idx_32, idx_16, idx_8; + const loop_filter_info_n *const lfi_n = &cm->lf_info; + MODE_INFO **mip = mi_8x8; + MODE_INFO **mip2 = mi_8x8; + + // These are offsets to the next mi in the 64x64 block. It is what gets + // added to the mi ptr as we go through each loop. It helps us to avoids + // setting up special row and column counters for each index. The last step + // brings us out back to the starting position. 
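Concretely, at the 32x32 level the walk visits the four quadrants in raster order and the fourth offset returns the pointer to its origin. A standalone simulation (the stride value is an arbitrary stand-in for mode_info_stride):

#include <stdio.h>

int main(void) {
  const int stride = 64;  /* stand-in for cm->mode_info_stride */
  const int offset_32[] = {4, (stride << 2) - 4, 4, -(stride << 2) - 4};
  int pos = 0, i;
  for (i = 0; i < 4; ++i) {
    /* Visits (0,0), (0,4), (4,0), (4,4) in mi units. */
    printf("quadrant %d at (row %d, col %d)\n", i, pos / stride, pos % stride);
    pos += offset_32[i];
  }
  /* pos is 0 again here: ready for the next 64x64 block. */
  return 0;
}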
+ const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4, + -(mode_info_stride << 2) - 4}; + const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2, + -(mode_info_stride << 1) - 2}; + const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1}; + + // Following variables represent shifts to position the current block + // mask over the appropriate block. A shift of 36 to the left will move + // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left + // 4 rows to the appropriate spot. + const int shift_32_y[] = {0, 4, 32, 36}; + const int shift_16_y[] = {0, 2, 16, 18}; + const int shift_8_y[] = {0, 1, 8, 9}; + const int shift_32_uv[] = {0, 2, 8, 10}; + const int shift_16_uv[] = {0, 1, 4, 5}; + int i; + const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ? + cm->mi_rows - mi_row : MI_BLOCK_SIZE); + const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ? + cm->mi_cols - mi_col : MI_BLOCK_SIZE); + + vp9_zero(*lfm); + + // TODO(jimbankoski): Try moving most of the following code into decode + // loop and storing lfm in the mbmi structure so that we don't have to go + // through the recursive loop structure multiple times. + switch (mip[0]->mbmi.sb_type) { + case BLOCK_64X64: + build_masks(lfi_n, mip[0] , 0, 0, lfm); + break; + case BLOCK_64X32: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + mode_info_stride * 4; + if (4 >= max_rows) + break; + build_masks(lfi_n, mip2[0], 32, 8, lfm); + break; + case BLOCK_32X64: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + 4; + if (4 >= max_cols) + break; + build_masks(lfi_n, mip2[0], 4, 2, lfm); + break; + default: + for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { + const int shift_y = shift_32_y[idx_32]; + const int shift_uv = shift_32_uv[idx_32]; + const int mi_32_col_offset = ((idx_32 & 1) << 2); + const int mi_32_row_offset = ((idx_32 >> 1) << 2); + if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) + continue; + switch (mip[0]->mbmi.sb_type) { + case BLOCK_32X32: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + break; + case BLOCK_32X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_32_row_offset + 2 >= max_rows) + continue; + mip2 = mip + mode_info_stride * 2; + build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + break; + case BLOCK_16X32: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_32_col_offset + 2 >= max_cols) + continue; + mip2 = mip + 2; + build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + break; + default: + for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { + const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; + const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int mi_16_col_offset = mi_32_col_offset + + ((idx_16 & 1) << 1); + const int mi_16_row_offset = mi_32_row_offset + + ((idx_16 >> 1) << 1); + + if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows) + continue; + + switch (mip[0]->mbmi.sb_type) { + case BLOCK_16X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + break; + case BLOCK_16X8: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_16_row_offset + 1 >= max_rows) + continue; + mip2 = mip + mode_info_stride; + build_y_mask(lfi_n, mip2[0], shift_y+8, lfm); + break; + case BLOCK_8X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_16_col_offset +1 >= max_cols) + continue; + mip2 = mip + 1; + build_y_mask(lfi_n, mip2[0], shift_y+1, lfm); + break; + default: { + const int shift_y = 
shift_32_y[idx_32] + + shift_16_y[idx_16] + + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + mip += offset[0]; + for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { + const int shift_y = shift_32_y[idx_32] + + shift_16_y[idx_16] + + shift_8_y[idx_8]; + const int mi_8_col_offset = mi_16_col_offset + + ((idx_8 & 1)); + const int mi_8_row_offset = mi_16_row_offset + + ((idx_8 >> 1)); + + if (mi_8_col_offset >= max_cols || + mi_8_row_offset >= max_rows) + continue; + build_y_mask(lfi_n, mip[0], shift_y, lfm); + } + break; + } + } + } + break; + } + } + break; + } + // The largest loopfilter we have is 16x16 so we use the 16x16 mask + // for 32x32 transforms also also. + lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32]; + lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32]; + lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32]; + lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; + + // We do at least 8 tap filter on every 32x32 even if the transform size + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // remove it from the 4x4. + lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; + lfm->left_y[TX_4X4] &= ~left_border; + lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border; + lfm->above_y[TX_4X4] &= ~above_border; + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv; + lfm->left_uv[TX_4X4] &= ~left_border_uv; + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv; + lfm->above_uv[TX_4X4] &= ~above_border_uv; + + // We do some special edge handling. + if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) { + const uint64_t rows = cm->mi_rows - mi_row; + + // Each pixel inside the border gets a 1, + const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1); + const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1); + + // Remove values completely outside our border. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv; + + // We don't apply a wide loop filter on the last uv block row. If set + // apply the shorter one instead. + if (rows == 1) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; + lfm->above_uv[TX_16X16] = 0; + } + if (rows == 5) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00; + lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00); + } + } + + if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) { + const uint64_t columns = cm->mi_cols - mi_col; + + // Each pixel inside the border gets a 1, the multiply copies the border + // to where we need it. + const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101; + const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111; + + // Internal edges are not applied on the last column of the image so + // we mask 1 more for the internal edges + const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111; + + // Remove the bits outside the image edge. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv_int; + + // We don't apply a wide loop filter on the last uv column. If set + // apply the shorter one instead. 
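The column-edge masks above lean on a replication trick: ((1 << columns) - 1) forms a run of ones in the bottom row, and multiplying by 0x0101010101010101 copies that run into all eight rows. A standalone restatement (assuming 1 <= columns <= 8, which the MI_BLOCK_SIZE bound guarantees):

#include <stdio.h>
#include <stdint.h>

/* Keep only the first 'columns' columns of 8x8 blocks in a y mask. */
static uint64_t column_mask(int columns) {
  return ((UINT64_C(1) << columns) - 1) * UINT64_C(0x0101010101010101);
}

int main(void) {
  /* 3 remaining columns: 00000111 in every row, i.e. 0x0707070707070707. */
  printf("%016llx\n", (unsigned long long)column_mask(3));
  return 0;
}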
+ if (columns == 1) { + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; + lfm->left_uv[TX_16X16] = 0; + } + if (columns == 5) { + lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc); + lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); + } + } + // We don't a loop filter on the first column in the image. Mask that out. + if (mi_col == 0) { + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= 0xfefefefefefefefe; + lfm->left_uv[i] &= 0xeeee; + } + } +} +#if CONFIG_NON420 +static void filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, + int mi_row, int mi_col) { const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_x; @@ -262,24 +816,25 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { - const int skip_this = mi[c].mbmi.mb_skip_coeff - && is_inter_block(&mi[c].mbmi); + const MODE_INFO *mi = mi_8x8[c]; + const int skip_this = mi[0].mbmi.skip_coeff + && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip - const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ? - !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ? + !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; const int skip_this_c = skip_this && !block_edge_left; // top edge of current unit is block/partition edge -> no skip - const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ? - !(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ? + !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) - ? get_uv_tx_size(&mi[c].mbmi) - : mi[c].mbmi.txfm_size; + ? 
get_uv_tx_size(&mi[0].mbmi) + : mi[0].mbmi.tx_size; const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x))) + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) continue; // Build masks based on the transform size of each block @@ -338,7 +893,7 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_4x4_c & border_mask, mask_4x4_int[r], lfi[r]); dst->buf += 8 * dst->stride; - mi += row_step_stride; + mi_8x8 += row_step_stride; } // Now do horizontal pass @@ -355,33 +910,146 @@ static void filter_block_plane(VP9_COMMON *const cm, dst->buf += 8 * dst->stride; } } +#endif + +static void filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + MODE_INFO **mi_8x8, + int mi_row, int mi_col, + LOOP_FILTER_MASK *lfm) { + const int ss_x = plane->subsampling_x; + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_x; + const int col_step = 1 << ss_y; + const int row_step_stride = cm->mode_info_stride * row_step; + struct buf_2d *const dst = &plane->dst; + uint8_t* const dst0 = dst->buf; + unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; + struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + int r, c; + int row_shift = 3 - ss_x; + int row_mask = 0xff >> (ss_x << 2); + +#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask) + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + int r_sampled = r >> ss_x; + + // Determine the vertical edges that need filtering + for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { + const MODE_INFO *mi = mi_8x8[c]; + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) + continue; + } + if (!plane->plane_type) { + mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); + // Disable filtering on the leftmost column + filter_selectively_vert(dst->buf, dst->stride, + MASK_ROW(lfm->left_y[TX_16X16]), + MASK_ROW(lfm->left_y[TX_8X8]), + MASK_ROW(lfm->left_y[TX_4X4]), + MASK_ROW(lfm->int_4x4_y), + lfi[r]); + } else { + mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv); + // Disable filtering on the leftmost column + filter_selectively_vert(dst->buf, dst->stride, + MASK_ROW(lfm->left_uv[TX_16X16]), + MASK_ROW(lfm->left_uv[TX_8X8]), + MASK_ROW(lfm->left_uv[TX_4X4]), + MASK_ROW(lfm->int_4x4_uv), + lfi[r]); + } + dst->buf += 8 * dst->stride; + mi_8x8 += row_step_stride; + } + + // Now do horizontal pass + dst->buf = dst0; + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; + int r_sampled = r >> ss_x; + + if (!plane->plane_type) { + filter_selectively_horiz(dst->buf, dst->stride, + MASK_ROW(lfm->above_y[TX_16X16]), + MASK_ROW(lfm->above_y[TX_8X8]), + MASK_ROW(lfm->above_y[TX_4X4]), + MASK_ROW(lfm->int_4x4_y), + mi_row + r == 0, lfi[r]); + } else { + filter_selectively_horiz(dst->buf, dst->stride, + MASK_ROW(lfm->above_uv[TX_16X16]), + MASK_ROW(lfm->above_uv[TX_8X8]), + MASK_ROW(lfm->above_uv[TX_4X4]), + mask_4x4_int_r, + mi_row + r == 0, lfi[r]); + } + dst->buf += 8 * dst->stride; + } +#undef MASK_ROW +} void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm, MACROBLOCKD *xd, int start, int stop, int y_only) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; int mi_row, mi_col; + LOOP_FILTER_MASK lfm; +#if CONFIG_NON420 + int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && + xd->plane[1].subsampling_x == 1); +#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { - MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride; + MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + + // TODO(JBB): Make setup_mask work for non 420. +#if CONFIG_NON420 + if (use_420) +#endif + setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, + &lfm); + for (plane = 0; plane < num_planes; ++plane) { - filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col); +#if CONFIG_NON420 + if (use_420) +#endif + filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, + mi_col, &lfm); +#if CONFIG_NON420 + else + filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, + mi_row, mi_col); +#endif } } } } void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, - int frame_filter_level, int y_only) { + int frame_filter_level, + int y_only, int partial) { + int start_mi_row, end_mi_row, mi_rows_to_filter; if (!frame_filter_level) return; - vp9_loop_filter_frame_init(cm, xd, frame_filter_level); + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); vp9_loop_filter_rows(cm->frame_to_show, cm, xd, - 0, cm->mi_rows, y_only); + start_mi_row, end_mi_row, + y_only); } int vp9_loop_filter_worker(void *arg1, void *arg2) { diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index 5fc9094..91d40ac 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -22,6 +22,27 @@ #define SIMD_WIDTH 16 +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +struct loopfilter { + int filter_level; + + int sharpness_level; + int last_sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, GF, ARF + signed char ref_deltas[MAX_REF_LF_DELTAS]; + signed char last_ref_deltas[MAX_REF_LF_DELTAS]; + + // 0 = ZERO_MV, MV + signed char mode_deltas[MAX_MODE_LF_DELTAS]; + signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; +}; + // Need to align this structure so when it is declared and // passed it can be loaded into vector registers. typedef struct { @@ -39,19 +60,17 @@ typedef struct { struct VP9Common; struct macroblockd; -void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf); +void vp9_loop_filter_init(struct VP9Common *cm); // Update the loop filter for the current frame. // This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame() // calls this function directly. -void vp9_loop_filter_frame_init(struct VP9Common *const cm, - struct macroblockd *const xd, - int default_filt_lvl); +void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd, int filter_level, - int y_only); + int y_only, int partial); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. 
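The partial flag added to vp9_loop_filter_frame above restricts filtering to a band around the middle of the frame (and only when cm->mi_rows > 8), which is cheap enough for an encoder probing filter levels. The [start, stop) range it hands to the row-based entry point declared below works out as in this sketch (mine, mirroring the arithmetic above):

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Row band used by partial filtering: start at mid-frame, aligned down to
 * a superblock (8 mi units), and cover at least 8 mi rows. */
static void partial_range(int mi_rows, int *start, int *stop) {
  *start = (mi_rows >> 1) & ~7;
  *stop = *start + MAX(mi_rows / 8, 8);
}

int main(void) {
  int start, stop;
  partial_range(135, &start, &stop);  /* 1080-tall frame: mi_rows = 135 */
  printf("[%d, %d)\n", start, stop);  /* prints [64, 80) */
  return 0;
}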
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index 3b72f41..bfeeb57 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -1,3 +1,4 @@ + /* * Copyright (c) 2012 The WebM project authors. All Rights Reserved. * @@ -36,7 +37,7 @@ static const int mode_2_counter[MB_MODE_COUNT] = { 9, // D135_PRED 9, // D117_PRED 9, // D153_PRED - 9, // D27_PRED + 9, // D207_PRED 9, // D63_PRED 9, // TM_PRED 0, // NEARESTMV @@ -70,33 +71,33 @@ static const int counter_to_context[19] = { BOTH_INTRA // 18 }; -static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { - // SB4X4 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB4X8 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB8X4 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB8X8 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB8X16 - {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, - // SB16X8 +static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { + // 4X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 4X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X16 {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, - // SB16X16 - {{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}}, - // SB16X32 - {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // SB32X16 + // 16X8 + {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, + // 16X16 + {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 16X32 {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, - // SB32X32 - {{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}}, - // SB32X64 - {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, - // SB64X32 + // 32X16 + {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X32 + {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X64 {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, - // SB64X64 - {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}} + // 64X32 + {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, + // 64X64 + {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} }; static const int idx_n_column_to_subblock[4][2] = { @@ -109,11 +110,11 @@ static const int idx_n_column_to_subblock[4][2] = { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { - clamp_mv(&mv->as_mv, xd->mb_to_left_edge - MV_BORDER, - xd->mb_to_right_edge + MV_BORDER, - xd->mb_to_top_edge - MV_BORDER, - xd->mb_to_bottom_edge + MV_BORDER); +static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); } // This function returns 
either the appropriate sub block or block's mv @@ -121,89 +122,75 @@ static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int check_sub_blocks, int which_mv, int search_col, int block_idx) { - return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_SIZE_SB8X8 + return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8 ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] .as_mv[which_mv] - : candidate->mbmi.mv[which_mv]); + : candidate->mbmi.mv[which_mv]; } // Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv, +static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, const MV_REFERENCE_FRAME this_ref_frame, const int *ref_sign_bias) { - int_mv return_mv = candidate->mbmi.mv[which_mv]; - - // Sign inversion where appropriate. - if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] != - ref_sign_bias[this_ref_frame]) { - return_mv.as_mv.row *= -1; - return_mv.as_mv.col *= -1; + int_mv mv = mbmi->mv[ref]; + if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; } - return return_mv; + return mv; } // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector it will also // skip all additional processing and jump to done! #define ADD_MV_REF_LIST(MV) \ - if (refmv_count) { \ - if ((MV).as_int != mv_ref_list[0].as_int) { \ - mv_ref_list[refmv_count] = (MV); \ - goto Done; \ + do { \ + if (refmv_count) { \ + if ((MV).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (MV); \ + goto Done; \ + } \ + } else { \ + mv_ref_list[refmv_count++] = (MV); \ } \ - } else { \ - mv_ref_list[refmv_count++] = (MV); \ - } + } while (0) // If either reference frame is different, not INTRA, and they // are different from each other scale and add the mv to our list. #define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \ - if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ - } \ - if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \ - (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \ - (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ - } + do { \ + if ((CANDIDATE)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ + if ((CANDIDATE)->ref_frame[1] != ref_frame && \ + has_second_ref(CANDIDATE) && \ + (CANDIDATE)->mv[1].as_int != (CANDIDATE)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ + } while (0) + // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. -static INLINE int is_inside(const int mi_col, const int mi_row, - const int cur_tile_mi_col_start, - const int cur_tile_mi_col_end, const int mi_rows, - const int (*mv_ref_search)[2], int idx) { - int mi_search_col; - const int mi_search_row = mi_row + mv_ref_search[idx][1];; - - // Check that the candidate is within the border. We only need to check - // the left side because all the positive right side ones are for blocks that - // are large enough to support the + value they have within their border. 
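The old check above guarded only the top and left sides; the replacement just below also rejects candidates past the bottom of the frame and the right edge of the tile. A positive-form equivalent (standalone, with simplified names standing in for the VP9_COMMON fields):

#include <stdio.h>

/* A candidate offset (dr, dc), in mi units, is usable only if the shifted
 * position stays inside the frame rows and the current tile's columns. */
static int candidate_is_inside(int mi_row, int mi_col, int dr, int dc,
                               int tile_start, int tile_end, int mi_rows) {
  return mi_row + dr >= 0 &&
         mi_col + dc >= tile_start &&
         mi_row + dr < mi_rows &&
         mi_col + dc < tile_end;
}

int main(void) {
  /* A candidate one row above the top of the frame is rejected. */
  printf("%d\n", candidate_is_inside(0, 5, -1, 0, 0, 8, 64));
  return 0;
}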
- if (mi_search_row < 0) - return 0; - - mi_search_col = mi_col + mv_ref_search[idx][0]; - if (mi_search_col < cur_tile_mi_col_start) - return 0; - - return 1; +static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row, + const MV *mv) { + return !(mi_row + mv->row < 0 || + mi_col + mv->col < cm->cur_tile_mi_col_start || + mi_row + mv->row >= cm->mi_rows || + mi_col + mv->col >= cm->cur_tile_mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, - const MODE_INFO *lf_here, - const MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, const int *ref_sign_bias, - const int block_idx, - const int mi_row, const int mi_col) { - int idx; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - int refmv_count = 0; - const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type]; - const MODE_INFO *candidate; - const int check_sub_blocks = block_idx >= 0; +void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block_idx, + int mi_row, int mi_col) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; int different_ref_found = 0; int context_counter = 0; @@ -213,28 +200,27 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, // The nearest 2 blocks are treated differently // if the size < 8x8 we get the mv from the bmi substructure, // and we also need to keep a mode count. - for (idx = 0; idx < 2; ++idx) { - if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, - cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) - continue; - - candidate = here + mv_ref_search[idx][0] - + mv_ref_search[idx][1] * xd->mode_info_stride; - - // Keep counts for entropy encoding. - context_counter += mode_2_counter[candidate->mbmi.mode]; - - // Check if the candidate comes from the same reference frame. - if (candidate->mbmi.ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0, - mv_ref_search[idx][0], block_idx)); - different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; - } else { - different_ref_found = 1; - if (candidate->mbmi.ref_frame[1] == ref_frame) { - // Add second motion vector if it has the same ref_frame. - ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1, - mv_ref_search[idx][0], block_idx)); + for (i = 0; i < 2; ++i) { + const MV *const mv_ref = &mv_ref_search[i]; + if (is_inside(cm, mi_col, mi_row, mv_ref)) { + const int check_sub_blocks = block_idx >= 0; + const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row + * xd->mode_info_stride]; + const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mode]; + + // Check if the candidate comes from the same reference frame. + if (candidate->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0, + mv_ref->col, block_idx)); + different_ref_found = candidate->ref_frame[1] != ref_frame; + } else { + if (candidate->ref_frame[1] == ref_frame) + // Add second motion vector if it has the same ref_frame. 
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1, + mv_ref->col, block_idx)); + different_ref_found = 1; } } } @@ -242,68 +228,59 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, // Check the rest of the neighbors in much the same way // as before except we don't need to keep track of sub blocks or // mode counts. - for (; idx < MVREF_NEIGHBOURS; ++idx) { - if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, - cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) - continue; - - candidate = here + mv_ref_search[idx][0] - + mv_ref_search[idx][1] * xd->mode_info_stride; - - if (candidate->mbmi.ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(candidate->mbmi.mv[0]); - different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; - } else { - different_ref_found = 1; - if (candidate->mbmi.ref_frame[1] == ref_frame) { - ADD_MV_REF_LIST(candidate->mbmi.mv[1]); + for (; i < MVREF_NEIGHBOURS; ++i) { + const MV *const mv_ref = &mv_ref_search[i]; + if (is_inside(cm, mi_col, mi_row, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + + mv_ref->row + * xd->mode_info_stride]->mbmi; + + if (candidate->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(candidate->mv[0]); + different_ref_found = candidate->ref_frame[1] != ref_frame; + } else { + if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[1]); + different_ref_found = 1; } } } // Check the last frame's mode and mv info. - if (lf_here != NULL) { - if (lf_here->mbmi.ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(lf_here->mbmi.mv[0]); - } else if (lf_here->mbmi.ref_frame[1] == ref_frame) { - ADD_MV_REF_LIST(lf_here->mbmi.mv[1]); - } + if (prev_mbmi) { + if (prev_mbmi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(prev_mbmi->mv[0]); + else if (prev_mbmi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(prev_mbmi->mv[1]); } // Since we couldn't find 2 mvs from the same reference frame // go back through the neighbors and find motion vectors from // different reference frames. if (different_ref_found) { - for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) { - if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, - cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) - continue; - - candidate = here + mv_ref_search[idx][0] - + mv_ref_search[idx][1] * xd->mode_info_stride; - - // If the candidate is INTRA we don't want to consider its mv. - if (candidate->mbmi.ref_frame[0] == INTRA_FRAME) - continue; - - IF_DIFF_REF_FRAME_ADD_MV(candidate); + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const MV *mv_ref = &mv_ref_search[i]; + if (is_inside(cm, mi_col, mi_row, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + + mv_ref->row + * xd->mode_info_stride]->mbmi; + + // If the candidate is INTRA we don't want to consider its mv. + if (is_inter_block(candidate)) + IF_DIFF_REF_FRAME_ADD_MV(candidate); + } } } // Since we still don't have a candidate we'll try the last frame. 
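A side note on the macro rewrite earlier in this hunk: wrapping ADD_MV_REF_LIST and IF_DIFF_REF_FRAME_ADD_MV in do { ... } while (0) is the standard guard that makes a multi-statement macro behave as a single statement, so the unbraced "if (...) ADD_MV_REF_LIST(...);" call sites above stay well formed. A minimal illustration (not from the change):

#include <stdio.h>

#define ADD_SAFE(x) do { if ((x) > 0) puts("kept"); else puts("dropped"); } while (0)

/* A bare if/else body, e.g.
 *   #define ADD_BARE(x) if ((x) > 0) puts("kept"); else puts("dropped")
 * would not compile below: the ';' after ADD_BARE(v) terminates the
 * macro's if/else, leaving the caller's 'else' dangling. */

int main(void) {
  int v = 1, have_room = 1;
  if (have_room)
    ADD_SAFE(v);
  else
    puts("list full");
  return 0;
}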
- if (lf_here != NULL && lf_here->mbmi.ref_frame[0] != INTRA_FRAME) { - IF_DIFF_REF_FRAME_ADD_MV(lf_here); - } + if (prev_mbmi && is_inter_block(prev_mbmi)) + IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi); Done: - mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter]; + mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter]; // Clamp vectors - for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { - clamp_mv_ref(xd, &mv_ref_list[idx]); - } + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } - -#undef ADD_MV_REF_LIST -#undef IF_DIFF_REF_FRAME_ADD_MV diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index c5f89eb..39ebdb0 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -14,27 +14,20 @@ #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ #define VP9_COMMON_VP9_MVREF_COMMON_H_ -void vp9_find_mv_refs_idx(VP9_COMMON *cm, - MACROBLOCKD *xd, - MODE_INFO *here, - const MODE_INFO *lf_here, - const MV_REFERENCE_FRAME ref_frame, +void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - const int *ref_sign_bias, - const int block_idx, - const int mi_row, - const int mi_col); + int block_idx, + int mi_row, int mi_col); -static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, - MACROBLOCKD *xd, - MODE_INFO *here, - MODE_INFO *lf_here, +static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int *ref_sign_bias, int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame, - mv_ref_list, ref_sign_bias, -1, mi_row, mi_col); + vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame, + mv_ref_list, -1, mi_row, mi_col); } #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h index 152046f..f424e6a 100644 --- a/libvpx/vp9/common/vp9_onyx.h +++ b/libvpx/vp9/common/vp9_onyx.h @@ -46,7 +46,8 @@ extern "C" typedef enum { USAGE_STREAM_FROM_SERVER = 0x0, USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2 + USAGE_CONSTRAINED_QUALITY = 0x2, + USAGE_CONSTANT_QUALITY = 0x3, } END_USAGE; @@ -130,6 +131,8 @@ extern "C" // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- + // Spatial scalability + int ss_number_layers; // these parameters aren't to be used in final build don't use!!! 
int play_alternate; @@ -210,6 +213,13 @@ extern "C" int vp9_set_internal_size(VP9_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + int vp9_set_size_literal(VP9_PTR comp, unsigned int width, + unsigned int height); + + int vp9_switch_layer(VP9_PTR comp, int layer); + + void vp9_set_svc(VP9_PTR comp, int use_svc); + int vp9_get_quantizer(VP9_PTR c); #ifdef __cplusplus diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 152a932..0431e14 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -20,7 +20,7 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_quant_common.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif @@ -38,14 +38,14 @@ #define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) typedef struct frame_contexts { - vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; - vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; + vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS - 1]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; vp9_prob single_ref_prob[REF_CONTEXTS][2]; @@ -56,15 +56,15 @@ typedef struct frame_contexts { } FRAME_CONTEXT; typedef struct { - unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; - unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES]; + unsigned int switchable_interp[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; unsigned int single_ref[REF_CONTEXTS][2][2]; @@ -164,6 +164,10 @@ typedef struct VP9Common { MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + MODE_INFO **mi_grid_base; + MODE_INFO **mi_grid_visible; + MODE_INFO **prev_mi_grid_base; + MODE_INFO **prev_mi_grid_visible; // Persistent mb segment id map used in prediction. 
unsigned char *last_frame_seg_map; @@ -176,6 +180,9 @@ typedef struct VP9Common { int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + struct loopfilter lf; + struct segmentation seg; + /* Y,U,V */ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; @@ -198,7 +205,7 @@ typedef struct VP9Common { unsigned int current_video_frame; int version; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC struct postproc_state postproc_state; #endif @@ -231,7 +238,19 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) { } static int mi_cols_aligned_to_sb(int n_mis) { - return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE); + return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); +} + +static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col) { + const int above_idx = mi_col * 2; + const int left_idx = (mi_row * 2) & 15; + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x); + pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y); + } } static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -240,25 +259,20 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); } -static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - int bsl = mi_width_log2(bsize), bs = 1 << bsl; - int ms = bs / 2; +// return the node index in the prob tree for binary coding +static int check_bsize_coverage(int bs, int mi_rows, int mi_cols, + int mi_row, int mi_col) { + const int r = (mi_row + bs < mi_rows); + const int c = (mi_col + bs < mi_cols); - if ((mi_row + ms < cm->mi_rows) && (mi_col + ms < cm->mi_cols)) + if (r && c) return 0; - // frame width/height are multiples of 8, hence 8x8 block should always - // pass the above check - assert(bsize > BLOCK_SIZE_SB8X8); - - // return the node index in the prob tree for binary coding - // only allow horizontal/split partition types - if ((mi_col + ms < cm->mi_cols) && (mi_row + ms >= cm->mi_rows)) - return 1; - // only allow vertical/split partition types - if ((mi_row + ms < cm->mi_rows) && (mi_col + ms >= cm->mi_cols)) - return 2; + if (c && !r) + return 1; // only allow horizontal/split partition types + + if (r && !c) + return 2; // only allow vertical/split partition types return -1; } diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 1157fbb..955e676 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -53,7 +53,7 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { { RGB_TO_YUV(0xCC33FF) }, /* Magenta */ }; -static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = { +static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = { { RGB_TO_YUV(0x6633ff) }, /* Purple */ { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ { RGB_TO_YUV(0xff33cc) }, /* Pink */ @@ -630,23 +630,21 @@ static void constrain_line(int x0, int *x1, int y0, int *y1, } } -int vp9_post_proc_frame(struct VP9Common *oci, - struct loopfilter *lf, - YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { - int q = lf->filter_level * 10 / 6; +int vp9_post_proc_frame(struct VP9Common *cm, + YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { + int q = cm->lf.filter_level * 10 / 6; int flags = ppflags->post_proc_flag; int 
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index 1157fbb..955e676 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -53,7 +53,7 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
   { RGB_TO_YUV(0xCC33FF) },  /* Magenta */
 };
 
-static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = {
+static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = {
   { RGB_TO_YUV(0x6633ff) },  /* Purple */
   { RGB_TO_YUV(0xcc33ff) },  /* Magenta */
   { RGB_TO_YUV(0xff33cc) },  /* Pink */
@@ -630,23 +630,21 @@ static void constrain_line(int x0, int *x1, int y0, int *y1,
   }
 }
 
-int vp9_post_proc_frame(struct VP9Common *oci,
-                        struct loopfilter *lf,
-                        YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *ppflags) {
-  int q = lf->filter_level * 10 / 6;
+int vp9_post_proc_frame(struct VP9Common *cm,
+                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
+  int q = cm->lf.filter_level * 10 / 6;
   int flags = ppflags->post_proc_flag;
   int deblock_level = ppflags->deblocking_level;
   int noise_level = ppflags->noise_level;
 
-  if (!oci->frame_to_show)
+  if (!cm->frame_to_show)
     return -1;
 
   if (q > 63)
     q = 63;
 
   if (!flags) {
-    *dest = *oci->frame_to_show;
+    *dest = *cm->frame_to_show;
     return 0;
   }
 
@@ -655,52 +653,52 @@ int vp9_post_proc_frame(struct VP9Common *oci,
 #endif
 
   if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+    deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
                                q + (deblock_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
+    vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
   } else {
-    vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+    vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
   }
 
   if (flags & VP9D_ADDNOISE) {
-    if (oci->postproc_state.last_q != q
-        || oci->postproc_state.last_noise != noise_level) {
-      fillrd(&oci->postproc_state, 63 - q, noise_level);
+    if (cm->postproc_state.last_q != q
+        || cm->postproc_state.last_noise != noise_level) {
+      fillrd(&cm->postproc_state, 63 - q, noise_level);
     }
 
-    vp9_plane_add_noise(oci->post_proc_buffer.y_buffer,
-                        oci->postproc_state.noise,
-                        oci->postproc_state.blackclamp,
-                        oci->postproc_state.whiteclamp,
-                        oci->postproc_state.bothclamp,
-                        oci->post_proc_buffer.y_width,
-                        oci->post_proc_buffer.y_height,
-                        oci->post_proc_buffer.y_stride);
+    vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
+                        cm->postproc_state.noise,
+                        cm->postproc_state.blackclamp,
+                        cm->postproc_state.whiteclamp,
+                        cm->postproc_state.bothclamp,
+                        cm->post_proc_buffer.y_width,
+                        cm->post_proc_buffer.y_height,
+                        cm->post_proc_buffer.y_stride);
   }
 
 #if 0 && CONFIG_POSTPROC_VISUALIZER
   if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
     char message[512];
     sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (oci->frame_type == KEY_FRAME),
-            oci->refresh_golden_frame,
-            oci->base_qindex,
-            oci->filter_level,
+            (cm->frame_type == KEY_FRAME),
+            cm->refresh_golden_frame,
+            cm->base_qindex,
+            cm->filter_level,
             flags,
-            oci->mb_cols, oci->mb_rows);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
+            cm->mb_cols, cm->mb_rows);
+    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+                  cm->post_proc_buffer.y_stride);
   }
 
   if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width >> 4;
     int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
+    MODE_INFO *mi = cm->mi;
 
     y_ptr = post->y_buffer + 4 * post->y_stride + 4;
 
@@ -725,11 +723,11 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width >> 4;
     int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
+    MODE_INFO *mi = cm->mi;
 
     y_ptr = post->y_buffer + 4 * post->y_stride + 4;
 
@@ -739,9 +737,9 @@ int vp9_post_proc_frame(struct VP9Common *oci,
       char zz[4];
       int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
                       mi[mb_index].mbmi.mode != SPLITMV &&
-                      mi[mb_index].mbmi.mb_skip_coeff);
+                      mi[mb_index].mbmi.skip_coeff);
 
-      if (oci->frame_type == KEY_FRAME)
+      if (cm->frame_type == KEY_FRAME)
         sprintf(zz, "a");
       else
         sprintf(zz, "%c", dc_diff + '0');
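For orientation, the strength plumbing at the top of vp9_post_proc_frame() is unchanged by the refactor: the postproc quantizer proxy is derived from the loop-filter level (now read from cm->lf rather than a separate loopfilter argument) and clamped to 63, and the demacroblocker adds a user-controlled offset on top. A standalone sketch of that arithmetic with illustrative values:

#include <stdio.h>

int main(void) {
  int filter_level, deblock_level = 6;  /* deblock_level is user input */
  for (filter_level = 0; filter_level <= 63; filter_level += 21) {
    int q = filter_level * 10 / 6;      /* as in vp9_post_proc_frame() */
    if (q > 63)
      q = 63;
    printf("filter_level %2d -> q %2d, demacroblock strength %3d\n",
           filter_level, q, q + (deblock_level - 5) * 10);
  }
  return 0;
}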
@@ -761,19 +759,19 @@
     char message[512];
     snprintf(message, sizeof(message),
              "Bitrate: %10.2f framerate: %10.2f ",
-             oci->bitrate, oci->framerate);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
+             cm->bitrate, cm->framerate);
+    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+                  cm->post_proc_buffer.y_stride);
   }
 
   /* Draw motion vectors */
   if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width = post->y_width;
     int height = post->y_height;
-    uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
     int x0, y0;
 
     for (y0 = 0; y0 < height; y0 += 16) {
@@ -882,7 +880,7 @@ int vp9_post_proc_frame(struct VP9Common *oci,
             }
           }
         }
-      } else if (mi->mbmi.mode >= NEARESTMV) {
+      } else if (is_inter_mode(mi->mbmi.mode)) {
         MV *mv = &mi->mbmi.mv.as_mv;
         const int lx0 = x0 + 8;
         const int ly0 = y0 + 8;
@@ -910,14 +908,14 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
       && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width = post->y_width;
     int height = post->y_height;
-    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
       for (x = 0; x < width; x += 16) {
@@ -975,14 +973,14 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
       ppflags->display_ref_frame_flag) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width = post->y_width;
     int height = post->y_height;
-    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
       for (x = 0; x < width; x += 16) {
@@ -1008,12 +1006,13 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   }
 #endif
 
-  *dest = oci->post_proc_buffer;
+  *dest = cm->post_proc_buffer;
 
   /* handle problem with extending borders */
-  dest->y_width = oci->width;
-  dest->y_height = oci->height;
-  dest->uv_height = dest->y_height / 2;
+  dest->y_width = cm->width;
+  dest->y_height = cm->height;
+  dest->uv_width = dest->y_width >> cm->subsampling_x;
+  dest->uv_height = dest->y_height >> cm->subsampling_y;
 
   return 0;
 }
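The final hunk above also fixes a hard-coded 4:2:0 assumption: the output buffer's chroma dimensions are now derived from cm->subsampling_x/y instead of always halving the luma height. A self-contained illustration (the frame size is arbitrary):

#include <stdio.h>

int main(void) {
  const int y_width = 1280, y_height = 720;
  int ss_x, ss_y;
  /* enumerate the four subsampling combinations VP9 can signal */
  for (ss_y = 0; ss_y <= 1; ++ss_y)
    for (ss_x = 0; ss_x <= 1; ++ss_x)
      printf("subsampling %d,%d -> uv plane %dx%d\n", ss_x, ss_y,
             y_width >> ss_x, y_height >> ss_y);
  return 0;
}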
diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h
index a814e39..c63beae 100644
--- a/libvpx/vp9/common/vp9_postproc.h
+++ b/libvpx/vp9/common/vp9_postproc.h
@@ -26,7 +26,7 @@ struct postproc_state {
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 
-int vp9_post_proc_frame(struct VP9Common *oci, struct loopfilter *lf,
+int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index 795962a..81fbf1f 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -18,50 +18,49 @@
 
 // Returns a context number for the given MB prediction signal
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   // left
-  const int left_mv_pred = is_inter_mode(left_mbmi->mode);
-  const int left_interp = left_in_image && left_mv_pred ?
-                              vp9_switchable_interp_map[left_mbmi->interp_filter] :
-                              VP9_SWITCHABLE_FILTERS;
+  const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
+                                         : 0;
+  const int left_interp = left_in_image && left_mv_pred
+                              ? left_mi->mbmi.interp_filter
+                              : SWITCHABLE_FILTERS;
 
   // above
-  const int above_mv_pred = is_inter_mode(above_mbmi->mode);
-  const int above_interp = above_in_image && above_mv_pred ?
-                               vp9_switchable_interp_map[above_mbmi->interp_filter] :
-                               VP9_SWITCHABLE_FILTERS;
-
-  assert(left_interp != -1);
-  assert(above_interp != -1);
+  const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
+                                           : 0;
+  const int above_interp = above_in_image && above_mv_pred
+                               ? above_mi->mbmi.interp_filter
+                               : SWITCHABLE_FILTERS;
 
   if (left_interp == above_interp)
     return left_interp;
-  else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-           above_interp != VP9_SWITCHABLE_FILTERS)
+  else if (left_interp == SWITCHABLE_FILTERS &&
+           above_interp != SWITCHABLE_FILTERS)
     return above_interp;
-  else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-           above_interp == VP9_SWITCHABLE_FILTERS)
+  else if (left_interp != SWITCHABLE_FILTERS &&
+           above_interp == SWITCHABLE_FILTERS)
     return left_interp;
   else
-    return VP9_SWITCHABLE_FILTERS;
+    return SWITCHABLE_FILTERS;
 }
 
 // Returns a context number for the given MB prediction signal
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
-  const int left_intra = !is_inter_block(left_mbmi);
-  const int above_intra = !is_inter_block(above_mbmi);
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
 
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
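The switchable-filter context above reduces to a small merge rule over the left and above neighbours: agreement wins, a lone inter neighbour wins, otherwise fall back to SWITCHABLE_FILTERS. A runnable mirror; the enum values are assumed to match VP9's ordering (EIGHTTAP = 0, ..., SWITCHABLE_FILTERS = 3):

#include <stdio.h>

enum { EIGHTTAP = 0, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, SWITCHABLE_FILTERS };

/* left/above: each neighbour's filter, or SWITCHABLE_FILTERS when the
   neighbour is missing or not an inter block. */
static int interp_context(int left, int above) {
  if (left == above)
    return left;
  if (left == SWITCHABLE_FILTERS)
    return above;
  if (above == SWITCHABLE_FILTERS)
    return left;
  return SWITCHABLE_FILTERS;
}

int main(void) {
  printf("%d\n", interp_context(EIGHTTAP, EIGHTTAP));                 /* 0 */
  printf("%d\n", interp_context(SWITCHABLE_FILTERS, EIGHTTAP_SHARP)); /* 2 */
  printf("%d\n", interp_context(EIGHTTAP_SMOOTH, EIGHTTAP_SHARP));    /* 3 */
  return 0;
}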
@@ -82,35 +81,35 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
 unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-        left_mbmi->ref_frame[1] <= INTRA_FRAME)
+    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
       // neither edge uses comp pred (0/1)
       pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
                      (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
-    else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
+    else if (!has_second_ref(above_mbmi))
       // one of two edges uses comp pred (2/3)
       pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                          above_mbmi->ref_frame[0] == INTRA_FRAME);
-    else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
+                          !is_inter_block(above_mbmi));
+    else if (!has_second_ref(left_mbmi))
       // one of two edges uses comp pred (2/3)
       pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                          left_mbmi->ref_frame[0] == INTRA_FRAME);
+                          !is_inter_block(left_mbmi));
     else
       // both edges use comp pred (4)
       pred_context = 4;
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    if (!has_second_ref(edge_mbmi))
       // edge does not use comp pred (0/1)
       pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
     else
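The comp-inter context just rewritten packs its cases into the values 0-4. A self-contained sketch of the same decision table; the *_fixed flags below stand for "first reference equals cm->comp_fixed_ref, or the block is intra", which the patch folds into the 2/3 cases:

#include <assert.h>
#include <stdio.h>

/* above_comp/left_comp: neighbour uses compound prediction.
   above_fixed/left_fixed: neighbour's first ref is the fixed comp ref
   (or the neighbour is intra). */
static int comp_inter_context(int above_comp, int above_fixed,
                              int left_comp, int left_fixed) {
  if (!above_comp && !left_comp)
    return above_fixed ^ left_fixed;  /* 0 or 1 */
  if (!above_comp)
    return 2 + above_fixed;           /* 2 or 3 */
  if (!left_comp)
    return 2 + left_fixed;            /* 2 or 3 */
  return 4;                           /* both compound */
}

int main(void) {
  assert(comp_inter_context(0, 1, 0, 1) == 0);
  assert(comp_inter_context(0, 1, 0, 0) == 1);
  assert(comp_inter_context(0, 0, 1, 0) == 2);
  assert(comp_inter_context(1, 0, 0, 1) == 3);
  assert(comp_inter_context(1, 0, 1, 0) == 4);
  printf("comp-inter contexts OK\n");
  return 0;
}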
@@ -127,11 +126,14 @@
 unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                               const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-cm->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
@@ -140,22 +142,19 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
   const int var_ref_idx = !fix_ref_idx;
 
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
-        left_mbmi->ref_frame[0] == INTRA_FRAME) {  // intra/intra (2)
+    if (above_intra && left_intra) {  // intra/intra (2)
       pred_context = 2;
-    } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
-               left_mbmi->ref_frame[0] == INTRA_FRAME) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
-                                          left_mbmi : above_mbmi;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
 
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)  // single pred (1/3)
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
       else  // comp pred (1/3)
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
                                     != cm->comp_var_ref[1]);
     } else {  // inter/inter
-      int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
-      int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
       MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
                                      : above_mbmi->ref_frame[var_ref_idx];
       MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
@@ -189,13 +188,15 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+    if (!is_inter_block(edge_mbmi)) {
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
-      pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
-                              != cm->comp_var_ref[1]);
-    else
-      pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+                                != cm->comp_var_ref[1]);
+      else
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+    }
   } else {  // no edges available (2)
     pred_context = 2;
   }
@@ -205,91 +206,91 @@
 }
 
 unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
-        left_mbmi->ref_frame[0] == INTRA_FRAME) {
+    if (above_intra && left_intra) {  // intra/intra
       pred_context = 2;
-    } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
-               left_mbmi->ref_frame[0] == INTRA_FRAME) {
-      const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
-                                          left_mbmi : above_mbmi;
-
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi))
         pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
       else
         pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
                             edge_mbmi->ref_frame[1] == LAST_FRAME);
-    } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-               left_mbmi->ref_frame[1] <= INTRA_FRAME) {
-      pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
-                     2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
-    } else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
-               left_mbmi->ref_frame[1] > INTRA_FRAME) {
-      pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                          above_mbmi->ref_frame[1] == LAST_FRAME ||
-                          left_mbmi->ref_frame[0] == LAST_FRAME ||
-                          left_mbmi->ref_frame[1] == LAST_FRAME);
-    } else {
-      MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
-
-      if (rfs == LAST_FRAME)
-        pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-      else
-        pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
+    } else {  // inter/inter
+      if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
+        pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
+                       2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
+      } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
+        pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
+                            above_mbmi->ref_frame[1] == LAST_FRAME ||
+                            left_mbmi->ref_frame[0] == LAST_FRAME ||
+                            left_mbmi->ref_frame[1] == LAST_FRAME);
+      } else {
+        const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+
+        if (rfs == LAST_FRAME)
+          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else
+          pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
+      }
     }
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
-
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+    if (!is_inter_block(edge_mbmi)) {  // intra
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
-      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-    else
-      pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                          edge_mbmi->ref_frame[1] == LAST_FRAME);
-  } else {  // no edges available (2)
+    } else {  // inter
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      else
+        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                            edge_mbmi->ref_frame[1] == LAST_FRAME);
+    }
+  } else {  // no edges available
     pred_context = 2;
   }
+
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
   return pred_context;
 }
 
 unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
-        left_mbmi->ref_frame[0] == INTRA_FRAME) {
+    if (above_intra && left_intra) {  // intra/intra
       pred_context = 2;
-    } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
-               left_mbmi->ref_frame[0] == INTRA_FRAME) {
-      const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
-                                          left_mbmi : above_mbmi;
-
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) {
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
         if (edge_mbmi->ref_frame[0] == LAST_FRAME)
           pred_context = 3;
         else
@@ -298,54 +299,53 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
                                 edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
       }
-    } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-               left_mbmi->ref_frame[1] <= INTRA_FRAME) {
-      if (above_mbmi->ref_frame[0] == LAST_FRAME &&
-          left_mbmi->ref_frame[0] == LAST_FRAME) {
-        pred_context = 3;
-      } else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                 left_mbmi->ref_frame[0] == LAST_FRAME) {
-        const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ?
-                                            left_mbmi : above_mbmi;
+    } else {  // inter/inter
+      if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
+        if (above_mbmi->ref_frame[0] == LAST_FRAME &&
+            left_mbmi->ref_frame[0] == LAST_FRAME) {
+          pred_context = 3;
+        } else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
+                   left_mbmi->ref_frame[0] == LAST_FRAME) {
+          const MB_MODE_INFO *edge_mbmi =
+              above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi;
 
-        pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        } else {
+          pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
+                         2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        }
+      } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
+        if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
+            above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
+          pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                              above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
+                              left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                              left_mbmi->ref_frame[1] == GOLDEN_FRAME);
+        else
+          pred_context = 2;
       } else {
-        pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
-                       2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+
+        if (rfs == GOLDEN_FRAME)
+          pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+        else if (rfs == ALTREF_FRAME)
+          pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+        else
+          pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
       }
-    } else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
-               left_mbmi->ref_frame[1] > INTRA_FRAME) {
-      if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
-          above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
-        pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                            above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
-                            left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                            left_mbmi->ref_frame[1] == GOLDEN_FRAME);
-      else
-        pred_context = 2;
-    } else {
-      MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
-
-      if (rfs == GOLDEN_FRAME)
-        pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
-      else if (rfs == ALTREF_FRAME)
-        pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
-      else
-        pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
     }
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME ||
-        (edge_mbmi->ref_frame[0] == LAST_FRAME &&
-         edge_mbmi->ref_frame[1] <= INTRA_FRAME))
+    if (!is_inter_block(edge_mbmi) ||
+        (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    else if (!has_second_ref(edge_mbmi))
       pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
     else
       pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
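Every function in this file was rewritten to use the has_second_ref()/is_inter_block() predicates instead of raw ref_frame[] comparisons against INTRA_FRAME. A minimal, self-contained mirror of those predicates, assuming VP9's reference-frame numbering (NONE = -1, INTRA_FRAME = 0, LAST_FRAME = 1, ...):

#include <stdio.h>

enum { NONE = -1, INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };
typedef struct { int ref_frame[2]; } mbmi_t;

static int has_second_ref(const mbmi_t *m) {
  return m->ref_frame[1] > INTRA_FRAME;  /* compound prediction */
}
static int is_inter_block(const mbmi_t *m) {
  return m->ref_frame[0] > INTRA_FRAME;
}

int main(void) {
  mbmi_t intra  = {{ INTRA_FRAME, NONE }};
  mbmi_t single = {{ LAST_FRAME, NONE }};
  mbmi_t comp   = {{ LAST_FRAME, ALTREF_FRAME }};
  printf("has_second_ref: %d %d %d\n",
         has_second_ref(&intra), has_second_ref(&single),
         has_second_ref(&comp));                     /* 0 0 1 */
  printf("is_inter_block: %d %d %d\n",
         is_inter_block(&intra), is_inter_block(&single),
         is_inter_block(&comp));                     /* 0 1 1 */
  return 0;
}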
@@ -361,22 +361,23 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
 // left of the entries corresponding to real blocks.
 // The prediction flags in these dummy entries are initialized to 0.
 unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
-  const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type];
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
   int above_context = max_tx_size;
   int left_context = max_tx_size;
 
   if (above_in_image)
-    above_context = above_mbmi->mb_skip_coeff ? max_tx_size
-                                              : above_mbmi->txfm_size;
+    above_context = above_mbmi->skip_coeff ? max_tx_size
+                                           : above_mbmi->tx_size;
 
   if (left_in_image)
-    left_context = left_mbmi->mb_skip_coeff ? max_tx_size
-                                            : left_mbmi->txfm_size;
+    left_context = left_mbmi->skip_coeff ? max_tx_size
+                                         : left_mbmi->tx_size;
 
   if (!left_in_image)
     left_context = above_context;
@@ -387,36 +388,17 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
   return above_context + left_context > max_tx_size;
 }
 
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag) {
-  MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
-  const int bw = 1 << mi_width_log2(bsize);
-  const int bh = 1 << mi_height_log2(bsize);
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int x, y;
-
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
-      mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
+void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
+  xd->this_mi->mbmi.seg_id_predicted = pred_flag;
 }
 
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag) {
-  MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
-  const int bw = 1 << mi_width_log2(bsize);
-  const int bh = 1 << mi_height_log2(bsize);
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int x, y;
-
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
-      mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag;
+void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                              uint8_t pred_flag) {
+  xd->this_mi->mbmi.skip_coeff = pred_flag;
 }
 
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
+                       BLOCK_SIZE bsize, int mi_row, int mi_col) {
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = 1 << mi_width_log2(bsize);
   const int bh = 1 << mi_height_log2(bsize);
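The transform-size context above sums the two neighbours' transform sizes, substituting max_tx_size for skipped or unavailable neighbours, and signals whether the sum exceeds max_tx_size. A runnable mirror with transform sizes as small integers (3 standing for a 32x32 transform):

#include <stdio.h>

static int tx_size_context(int max_tx_size,
                           int have_above, int above_tx, int above_skip,
                           int have_left, int left_tx, int left_skip) {
  /* skipped or missing neighbours count as max_tx_size */
  int above_ctx = (have_above && !above_skip) ? above_tx : max_tx_size;
  int left_ctx = (have_left && !left_skip) ? left_tx : max_tx_size;
  if (!have_left)
    left_ctx = above_ctx;
  if (!have_above)
    above_ctx = left_ctx;
  return above_ctx + left_ctx > max_tx_size;
}

int main(void) {
  /* 32x32 block: small neighbour transforms keep the context at 0 */
  printf("%d\n", tx_size_context(3, 1, 1, 0, 1, 1, 0));  /* 0 */
  printf("%d\n", tx_size_context(3, 1, 3, 0, 1, 1, 0));  /* 1 */
  printf("%d\n", tx_size_context(3, 0, 0, 0, 0, 0, 0));  /* 1 */
  return 0;
}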
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 238290b..47ca8ab 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -15,32 +15,32 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
+                       BLOCK_SIZE bsize, int mi_row, int mi_col);
 
 static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0;
+  const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0;
 
-  return above_mbmi->seg_id_predicted +
-         (xd->left_available ? left_mbmi->seg_id_predicted : 0);
+  return above_sip + (xd->left_available ? left_sip : 0);
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) {
-  return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)];
+static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
+                                                const MACROBLOCKD *xd) {
+  return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
 
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag);
+void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
 
 static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0;
+  const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0;
 
-  return above_mbmi->mb_skip_coeff +
-         (xd->left_available ? left_mbmi->mb_skip_coeff : 0);
+  return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0);
 }
 
 static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
@@ -49,20 +49,14 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
 }
 
 static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
-  return xd->mode_info_context->mbmi.mb_skip_coeff;
+  return xd->this_mi->mbmi.skip_coeff;
 }
 
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag);
+void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                              uint8_t pred_flag);
 
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
 
-static INLINE const vp9_prob *vp9_get_pred_probs_switchable_interp(
-    const VP9_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = vp9_get_pred_context_switchable_interp(xd);
-  return &cm->fc.switchable_interp_prob[pred_context][0];
-}
-
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm,
@@ -108,7 +102,7 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
 
 unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
 
-static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
                                     const struct tx_probs *tx_probs) {
   if (bsize < BLOCK_16X16)
     return tx_probs->p8x8[context];
@@ -119,13 +113,14 @@ static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
 }
 
 static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
-                                     const struct tx_probs *tx_probs) {
-  const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+                                     const struct tx_probs *tx_probs,
+                                     const MODE_INFO *m) {
+  const BLOCK_SIZE bsize = m->mbmi.sb_type;
   const int context = vp9_get_pred_context_tx_size(xd);
   return get_tx_probs(bsize, context, tx_probs);
 }
 
-static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
                              TX_SIZE tx_size, struct tx_counts *tx_counts) {
   if (bsize >= BLOCK_32X32)
     tx_counts->p32x32[context][tx_size]++;
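get_tx_probs() above bands its probability tables by block size, so a sub-16x16 block can never signal a 16x16 or 32x32 transform. A sketch of the banding; the BLOCK_* values mirror VP9's enum but are stand-ins here:

#include <stdio.h>

enum { BLOCK_8X8 = 3, BLOCK_16X16 = 6, BLOCK_32X32 = 9, BLOCK_64X64 = 12 };

static const char *tx_prob_band(int bsize) {
  if (bsize < BLOCK_16X16) return "p8x8";    /* up to 8x8 transforms */
  if (bsize < BLOCK_32X32) return "p16x16";  /* up to 16x16 */
  return "p32x32";                           /* full range */
}

int main(void) {
  printf("%s %s %s %s\n", tx_prob_band(BLOCK_8X8), tx_prob_band(BLOCK_16X16),
         tx_prob_band(BLOCK_32X32), tx_prob_band(BLOCK_64X64));
  return 0;
}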
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
index 48d86c5..bc40854 100644
--- a/libvpx/vp9/common/vp9_quant_common.c
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -130,12 +130,12 @@ int16_t vp9_ac_quant(int qindex, int delta) {
 }
 
-int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
-  if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) {
-    const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q);
-    return xd->seg.abs_delta == SEGMENT_ABSDATA ?
-               data :  // Abs value
-               clamp(base_qindex + data, 0, MAXQ);  // Delta value
+int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) {
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+    return seg->abs_delta == SEGMENT_ABSDATA ?
+               data :  // Abs value
+               clamp(base_qindex + data, 0, MAXQ);  // Delta value
   } else {
     return base_qindex;
   }
diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h
index ded9426..83f2fb6 100644
--- a/libvpx/vp9/common/vp9_quant_common.h
+++ b/libvpx/vp9/common/vp9_quant_common.h
@@ -23,6 +23,6 @@ void vp9_init_quant_tables();
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);
 
-int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);
+int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex);
 
 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
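vp9_get_qindex() now takes the segmentation struct directly, but its ALT_Q semantics are unchanged: a segment either overrides the base quantizer index outright (SEGMENT_ABSDATA) or applies a clamped delta. A self-contained mirror, assuming MAXQ = 255 as in VP9:

#include <stdio.h>

#define MAXQ 255

static int clampi(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* active: the segment's ALT_Q feature is enabled;
   abs_delta: SEGMENT_ABSDATA semantics; data: the segment value. */
static int get_qindex(int active, int abs_delta, int data, int base) {
  if (!active)
    return base;
  return abs_delta ? data : clampi(base + data, 0, MAXQ);
}

int main(void) {
  printf("%d\n", get_qindex(0, 0, 0, 100));    /* 100: feature off */
  printf("%d\n", get_qindex(1, 1, 40, 100));   /* 40: absolute override */
  printf("%d\n", get_qindex(1, 0, -30, 100));  /* 70: delta */
  printf("%d\n", get_qindex(1, 0, 200, 100));  /* 255: clamped */
  return 0;
}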
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index 0b65e06..dc1d46c 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -10,171 +10,27 @@
 
 #include <assert.h>
 
+#include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
+
 #include "vpx/vpx_integer.h"
+
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "./vpx_scale_rtcd.h"
-
-static int scale_value_x_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
-}
-
-static int scale_value_y_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
-}
-
-static int unscaled_value(int val, const struct scale_factors *scale) {
-  (void) scale;
-  return val;
-}
-
-static MV32 mv_q3_to_q4_with_scaling(const MV *mv,
-                                     const struct scale_factors *scale) {
-  const MV32 res = {
-    ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
-        + scale->y_offset_q4,
-    ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
-        + scale->x_offset_q4
-  };
-  return res;
-}
-
-static MV32 mv_q3_to_q4_without_scaling(const MV *mv,
-                                        const struct scale_factors *scale) {
-  const MV32 res = {
-    mv->row << 1,
-    mv->col << 1
-  };
-  return res;
-}
-
-static MV32 mv_q4_with_scaling(const MV *mv,
-                               const struct scale_factors *scale) {
-  const MV32 res = {
-    (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
-    (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
-  };
-  return res;
-}
-
-static MV32 mv_q4_without_scaling(const MV *mv,
-                                  const struct scale_factors *scale) {
-  const MV32 res = {
-    mv->row,
-    mv->col
-  };
-  return res;
-}
-
-static void set_offsets_with_scaling(struct scale_factors *scale,
-                                     int row, int col) {
-  const int x_q4 = 16 * col;
-  const int y_q4 = 16 * row;
-
-  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
-  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
-}
-
-static void set_offsets_without_scaling(struct scale_factors *scale,
-                                        int row, int col) {
-  scale->x_offset_q4 = 0;
-  scale->y_offset_q4 = 0;
-}
-
-static int get_fixed_point_scale_factor(int other_size, int this_size) {
-  // Calculate scaling factor once for each reference frame
-  // and use fixed point scaling factors in decoding and encoding routines.
-  // Hardware implementations can calculate scale factor in device driver
-  // and use multiplication and shifting on hardware instead of division.
-  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
-}
-
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h) {
-  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
-  scale->x_offset_q4 = 0;  // calculated per-mb
-  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
-
-  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
-  scale->y_offset_q4 = 0;  // calculated per-mb
-  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
-
-  if ((other_w == this_w) && (other_h == this_h)) {
-    scale->scale_value_x = unscaled_value;
-    scale->scale_value_y = unscaled_value;
-    scale->set_scaled_offsets = set_offsets_without_scaling;
-    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
-    scale->scale_mv_q4 = mv_q4_without_scaling;
-  } else {
-    scale->scale_value_x = scale_value_x_with_scaling;
-    scale->scale_value_y = scale_value_y_with_scaling;
-    scale->set_scaled_offsets = set_offsets_with_scaling;
-    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
-    scale->scale_mv_q4 = mv_q4_with_scaling;
-  }
-
-  // TODO(agrange): Investigate the best choice of functions to use here
-  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
-  // to do at full-pel offsets. The current selection, where the filter is
-  // applied in one direction only, and not at all for 0,0, seems to give the
-  // best quality, but it may be worth trying an additional mode that does
-  // do the filtering on full-pel.
-  if (scale->x_step_q4 == 16) {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in either direction.
-      scale->predict[0][0][0] = vp9_convolve_copy;
-      scale->predict[0][0][1] = vp9_convolve_avg;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      scale->predict[0][0][0] = vp9_convolve8_vert;
-      scale->predict[0][0][1] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_avg;
-    }
-  } else {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      scale->predict[0][0][0] = vp9_convolve8_horiz;
-      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_avg;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
-    } else {
-      // Must always scale in both directions.
-      scale->predict[0][0][0] = vp9_convolve8;
-      scale->predict[0][0][1] = vp9_convolve8_avg;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_avg;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_avg;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  scale->predict[1][1][0] = vp9_convolve8;
-  scale->predict[1][1][1] = vp9_convolve8_avg;
-}
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
-  if (xd->mode_info_context) {
-    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  if (xd->mi_8x8 && xd->this_mi) {
+    MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
 
     set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
                       cm->active_ref_scale);
+  } else {
+    set_scale_factors(xd, -1, -1, cm->active_ref_scale);
   }
 
   switch (mcomp_filter_type) {
@@ -199,17 +55,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *src_mv,
                                const struct scale_factors *scale,
-                               int w, int h, int weight,
+                               int w, int h, int ref,
                                const struct subpix_fn_table *subpix,
                                enum mv_precision precision) {
-  const MV32 mv = precision == MV_PRECISION_Q4
-      ? scale->scale_mv_q4(src_mv, scale)
-      : scale->scale_mv_q3_to_q4(src_mv, scale);
-  const int subpel_x = mv.col & 15;
-  const int subpel_y = mv.row & 15;
-
-  src += (mv.row >> 4) * src_stride + (mv.col >> 4);
-  scale->predict[!!subpel_x][!!subpel_y][weight](
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1,
+                     is_q4 ? src_mv->col : src_mv->col << 1 };
+  const MV32 mv = scale->scale_mv(&mv_q4, scale);
+  const int subpel_x = mv.col & SUBPEL_MASK;
+  const int subpel_y = mv.row & SUBPEL_MASK;
+
+  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+  scale->predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
       subpix->filter_x[subpel_x], scale->x_step_q4,
      subpix->filter_y[subpel_y], scale->y_step_q4,
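The rewritten vp9_build_inter_predictor() above splits a q4 (1/16-pel) motion vector into a full-pel buffer offset and a subpel phase that selects the interpolation filter, via SUBPEL_BITS/SUBPEL_MASK rather than the literals 4 and 15. A standalone illustration; note how the arithmetic right shift (on typical two's-complement targets) keeps negative MVs consistent with the masked phase:

#include <stdio.h>

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  const int mv_q4[] = { 0, 7, 16, -9 };  /* MV components in 1/16 pel */
  int i;
  for (i = 0; i < 4; ++i) {
    int mv = mv_q4[i];
    /* -9/16 decomposes as -1 full pel + 7/16 phase */
    printf("mv %3d/16 -> full-pel %2d, subpel phase %2d\n",
           mv, mv >> SUBPEL_BITS, mv & SUBPEL_MASK);
  }
  return 0;
}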
@@ -232,20 +89,16 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
   return res;
 }
 
-
 // TODO(jkoleszar): yet another mv clamping function :-(
-MV clamp_mv_to_umv_border_sb(const MV *src_mv,
-                             int bwl, int bhl, int ss_x, int ss_y,
-                             int mb_to_left_edge, int mb_to_top_edge,
-                             int mb_to_right_edge, int mb_to_bottom_edge) {
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y) {
   // If the MV points so far into the UMV border that no visible pixels
   // are used for reconstruction, the subpel part of the MV can be
   // discarded and the MV limited to 16 pixels with equivalent results.
-  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
-  const int spel_right = spel_left - (1 << 4);
-  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
-  const int spel_bottom = spel_top - (1 << 4);
+  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
+  const int spel_right = spel_left - SUBPEL_SHIFTS;
+  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
+  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
   MV clamped_mv = {
     src_mv->row << (1 - ss_y),
     src_mv->col << (1 - ss_x)
@@ -253,130 +106,143 @@ MV clamp_mv_to_umv_border_sb(const MV *src_mv,
   assert(ss_x <= 1);
   assert(ss_y <= 1);
 
-  clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
-           (mb_to_right_edge << (1 - ss_x)) + spel_right,
-           (mb_to_top_edge << (1 - ss_y)) - spel_top,
-           (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+  clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
+           (xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
+           (xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
+           (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
 
   return clamped_mv;
 }
 
 struct build_inter_predictors_args {
   MACROBLOCKD *xd;
-  int x;
-  int y;
-  uint8_t* dst[MAX_MB_PLANE];
-  int dst_stride[MAX_MB_PLANE];
-  uint8_t* pre[2][MAX_MB_PLANE];
-  int pre_stride[2][MAX_MB_PLANE];
+  int x, y;
 };
-static void build_inter_predictors(int plane, int block,
-                                   BLOCK_SIZE_TYPE bsize,
+
+static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
                                    int pred_w, int pred_h,
                                    void *argv) {
   const struct build_inter_predictors_args* const arg = argv;
-  MACROBLOCKD * const xd = arg->xd;
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
-  const MODE_INFO *const mi = xd->mode_info_context;
+  MACROBLOCKD *const xd = arg->xd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+  const int bw = 4 << bwl;
+  const int bh = plane_block_height(bsize, pd);
+  const int x = 4 * (block & ((1 << bwl) - 1));
+  const int y = 4 * (block >> bwl);
+  const MODE_INFO *mi = xd->this_mi;
   const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
-  int which_mv;
+  int ref;
+
+  assert(x < bw);
+  assert(y < bh);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
 
-  assert(x < (4 << bwl));
-  assert(y < (4 << bhl));
-  assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_w == (4 << bwl));
-  assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_h == (4 << bhl));
+  for (ref = 0; ref < 1 + use_second_ref; ++ref) {
+    struct scale_factors *const scale = &xd->scale_factor[ref];
+    struct buf_2d *const pre_buf = &pd->pre[ref];
+    struct buf_2d *const dst_buf = &pd->dst;
 
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    // source
-    const uint8_t * const base_pre = arg->pre[which_mv][plane];
-    const int pre_stride = arg->pre_stride[which_mv][plane];
-    const uint8_t *const pre = base_pre +
-        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
-    struct scale_factors * const scale = &xd->scale_factor[which_mv];
+    const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
+                                   pre_buf->stride, scale);
 
-    // dest
-    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
 
     // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
taken as the // same MV (the average of the 4 luma MVs) but we could do something // smarter for non-4:2:0. Just punt for now, pending the changes to get // rid of SPLITMV mode entirely. - const MV mv = mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 - ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv - : mi_mv_pred_q4(mi, which_mv)) - : mi->mbmi.mv[which_mv].as_mv; + const MV mv = mi->mbmi.sb_type < BLOCK_8X8 + ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv + : mi_mv_pred_q4(mi, ref)) + : mi->mbmi.mv[ref].as_mv; // TODO(jkoleszar): This clamping is done in the incorrect place for the // scaling case. It needs to be done on the scaled MV, not the pre-scaling // MV. Note however that it performs the subsampling aware scaling so // that the result is always q4. - const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y, - xd->mb_to_left_edge, - xd->mb_to_top_edge, - xd->mb_to_right_edge, - xd->mb_to_bottom_edge); + const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_stride, - dst, arg->dst_stride[plane], - &res_mv, &xd->scale_factor[which_mv], - 4 << pred_w, 4 << pred_h, which_mv, + vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + &res_mv, scale, + 4 << pred_w, 4 << pred_h, ref, &xd->subpix, MV_PRECISION_Q4); } } -void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, - int mi_row, - int mi_col, - BLOCK_SIZE_TYPE bsize) { - struct build_inter_predictors_args args = { - xd, mi_col * MI_SIZE, mi_row * MI_SIZE, - {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0}, - {{xd->plane[0].pre[0].buf, NULL, NULL}, - {xd->plane[0].pre[1].buf, NULL, NULL}}, - {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}}, - }; - foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args); +// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could +// calculate the subsampled BLOCK_SIZE, but that type isn't defined for +// sizes smaller than 16x16 yet. +typedef void (*foreach_predicted_block_visitor)(int plane, int block, + BLOCK_SIZE bsize, + int pred_w, int pred_h, + void *arg); +static INLINE void foreach_predicted_block_in_plane( + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane, + foreach_predicted_block_visitor visit, void *arg) { + int i, x, y; + + // block sizes in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // subsampled size of the block + const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + + // size of the predictor to use. 
+  int pred_w, pred_h;
+
+  if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
+    assert(bsize == BLOCK_8X8);
+    pred_w = 0;
+    pred_h = 0;
+  } else {
+    pred_w = bwl;
+    pred_h = bhl;
+  }
+  assert(pred_w <= bwl);
+  assert(pred_h <= bhl);
+
+  // visit each subblock in raster order
+  i = 0;
+  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
+    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
+      visit(plane, i, bsize, pred_w, pred_h, arg);
+      i += 1 << pred_w;
+    }
+    i += (1 << (bwl + pred_h)) - (1 << bwl);
+  }
 }
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     int mi_row,
-                                     int mi_col,
-                                     BLOCK_SIZE_TYPE bsize) {
-  struct build_inter_predictors_args args = {
-    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
-#if CONFIG_ALPHA
-    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-     xd->plane[3].dst.buf},
-    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
-     xd->plane[3].dst.stride},
-    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
-      xd->plane[3].pre[0].buf},
-     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
-      xd->plane[3].pre[1].buf}},
-    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
-      xd->plane[3].pre[0].stride},
-     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
-      xd->plane[3].pre[1].stride}},
-#else
-    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
-    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
-    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
-     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
-    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
-     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
-#endif
-  };
-  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
+
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col,
+                                              int plane_from, int plane_to) {
+  int plane;
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    struct build_inter_predictors_args args = {
+      xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
+    };
+    foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
+                                     &args);
+  }
 }
 
-void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
-                                   int mi_row, int mi_col,
-                                   BLOCK_SIZE_TYPE bsize) {
-  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-  vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+}
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
+                                    MAX_MB_PLANE - 1);
+}
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
+                                    MAX_MB_PLANE - 1);
 }
 
 // TODO(dkovalev: find better place for this function)
@@ -391,8 +257,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
                                      fb->y_crop_width, fb->y_crop_height,
                                      cm->width, cm->height);
 
-    if (sf->x_scale_fp != VP9_REF_NO_SCALE ||
-        sf->y_scale_fp != VP9_REF_NO_SCALE)
+    if (vp9_is_scaled(sf))
       vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
   }
 }
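The new foreach_predicted_block_in_plane() above visits a block's subblocks in raster order while maintaining the running 4x4-unit block index i. A self-contained mirror for a 16x16 block split into 8x8 predictors (all sizes expressed in 4x4 units, log2):

#include <stdio.h>

int main(void) {
  const int bwl = 2, bhl = 2;        /* 16x16 block: a 4x4 grid of 4x4s */
  const int pred_w = 1, pred_h = 1;  /* 8x8 predictors */
  int i = 0, x, y;
  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
      /* prints block indices 0, 2, 8, 10 - the raster numbering of
         the four 8x8 subblocks in 4x4 units */
      printf("visit block %2d at (%d,%d)\n", i, x, y);
      i += 1 << pred_w;
    }
    i += (1 << (bwl + pred_h)) - (1 << bwl);  /* skip the covered rows */
  }
  return 0;
}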
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 6ec7323..504b793 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -15,28 +15,19 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 struct subpix_fn_table;
-void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
-                                    int mb_row,
-                                    int mb_col,
-                                    BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize);
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     int mb_row,
-                                     int mb_col,
-                                     BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize);
 
-void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
-                                   int mb_row, int mb_col,
-                                   BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
                               VP9_COMMON *cm);
 
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h);
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index f351224..4a451b9 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -8,14 +8,15 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-
 #include "./vpx_config.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+
 #include "vp9_rtcd.h"
+
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
 
 const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
     DCT_DCT,    // DC
@@ -25,7 +26,7 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
     ADST_ADST,  // D135
     ADST_DCT,   // D117
     DCT_ADST,   // D153
-    DCT_ADST,   // D27
+    DCT_ADST,   // D207
     ADST_DCT,   // D63
     ADST_ADST,  // TM
     DCT_DCT,    // NEARESTMV
@@ -35,294 +36,256 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
 };
 
 #define intra_pred_sized(type, size) \
-void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \
-                                                ptrdiff_t stride, \
-                                                uint8_t *above_row, \
-                                                uint8_t *left_col) { \
-  type##_predictor(pred_ptr, stride, size, above_row, left_col); \
-}
+  void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  const uint8_t *above, \
+                                                  const uint8_t *left) { \
+    type##_predictor(dst, stride, size, above, left); \
+  }
+
 #define intra_pred_allsizes(type) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
  intra_pred_sized(type, 32)
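The directional predictors below all build on ROUND_POWER_OF_TWO() for their 2- and 3-tap averages; the macro, defined elsewhere in libvpx and reproduced here for reference, rounds to nearest:

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  int a = 100, b = 103, c = 105;
  printf("%d\n", ROUND_POWER_OF_TWO(a + b, 1));          /* 2-tap: 102 */
  printf("%d\n", ROUND_POWER_OF_TWO(a + b * 2 + c, 2));  /* 3-tap: 103 */
  return 0;
}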
- 2] + + left[bs - 1] * 3, 2); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; // rest of last row - for (c = 0; c < bs - 2; ++c) { - pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1]; - } + for (c = 0; c < bs - 2; ++c) + dst[(bs - 1) * stride + c] = left[bs - 1]; - for (r = bs - 2; r >= 0; --r) { - for (c = 0; c < bs - 2; ++c) { - pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2]; - } - } + for (r = bs - 2; r >= 0; --r) + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; } -intra_pred_allsizes(d27) +intra_pred_allsizes(d207) -static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - if (r & 1) { - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + - above_row[r/2 + c + 1] * 2 + - above_row[r/2 + c + 2], 2); - } else { - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + - above_row[r/2+ c + 1], 1); - } - } - pred_ptr += stride; + for (c = 0; c < bs; ++c) + dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1] * 2 + + above[r/2 + c + 2], 2) + : ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1], 1); + dst += stride; } } intra_pred_allsizes(d63) -static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - if (r + c + 2 < bs * 2) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] + - above_row[r + c + 1] * 2 + - above_row[r + c + 2], 2); - else - pred_ptr[c] = above_row[bs * 2 - 1]; - } - pred_ptr += stride; + for (c = 0; c < bs; ++c) + dst[c] = r + c + 2 < bs * 2 ? 
ROUND_POWER_OF_TWO(above[r + c] + + above[r + c + 1] * 2 + + above[r + c + 2], 2) + : above[bs * 2 - 1]; + dst += stride; } } intra_pred_allsizes(d45) -static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; + // first row for (c = 0; c < bs; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1); - pred_ptr += stride; + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1); + dst += stride; // second row - pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + - above_row[-1] * 2 + - above_row[0], 2); + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); for (c = 1; c < bs; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + - above_row[c - 1] * 2 + - above_row[c], 2); - pred_ptr += stride; + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst += stride; // the rest of first col - pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + - left_col[0] * 2 + - left_col[1], 2); + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); for (r = 3; r < bs; ++r) - pred_ptr[(r-2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] + - left_col[r - 2] * 2 + - left_col[r - 1], 2); + dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 + + left[r - 1], 2); + // the rest of the block for (r = 2; r < bs; ++r) { for (c = 1; c < bs; c++) - pred_ptr[c] = pred_ptr[-2 * stride + c - 1]; - pred_ptr += stride; + dst[c] = dst[-2 * stride + c - 1]; + dst += stride; } } intra_pred_allsizes(d117) -static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; - pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + - above_row[-1] * 2 + - above_row[0], 2); + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); for (c = 1; c < bs; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + - above_row[c - 1] * 2 + - above_row[c], 2); + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); - pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] + - left_col[0] * 2 + - left_col[1], 2); + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); for (r = 2; r < bs; ++r) - pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + - left_col[r - 1] * 2 + - left_col[r], 2); + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); - pred_ptr += stride; + dst += stride; for (r = 1; r < bs; ++r) { for (c = 1; c < bs; c++) - pred_ptr[c] = pred_ptr[-stride + c - 1]; - pred_ptr += stride; + dst[c] = dst[-stride + c - 1]; + dst += stride; } } intra_pred_allsizes(d135) -static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; - pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1); + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1); for (r = 1; r < bs; r++) - pred_ptr[r * stride] = - ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1); - pred_ptr++; - - pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + - above_row[-1] * 2 + - above_row[0], 2); - pred_ptr[stride] = 
ROUND_POWER_OF_TWO(above_row[-1] + - left_col[0] * 2 + - left_col[1], 2); + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1); + dst++; + + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); for (r = 2; r < bs; r++) - pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + - left_col[r - 1] * 2 + - left_col[r], 2); - pred_ptr++; + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); + dst++; for (c = 0; c < bs - 2; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + - above_row[c] * 2 + - above_row[c + 1], 2); - pred_ptr += stride; + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2); + dst += stride; + for (r = 1; r < bs; ++r) { for (c = 0; c < bs - 2; c++) - pred_ptr[c] = pred_ptr[-stride + c - 2]; - pred_ptr += stride; + dst[c] = dst[-stride + c - 2]; + dst += stride; } } intra_pred_allsizes(d153) -static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r; for (r = 0; r < bs; r++) { - vpx_memcpy(pred_ptr, above_row, bs); - pred_ptr += stride; + vpx_memcpy(dst, above, bs); + dst += stride; } } intra_pred_allsizes(v) -static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, left_col[r], bs); - pred_ptr += stride; + vpx_memset(dst, left[r], bs); + dst += stride; } } intra_pred_allsizes(h) -static INLINE void tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; - int ytop_left = above_row[-1]; + int ytop_left = above[-1]; for (r = 0; r < bs; r++) { for (c = 0; c < bs; c++) - pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left); - pred_ptr += stride; + dst[c] = clip_pixel(left[r] + above[c] - ytop_left); + dst += stride; } } intra_pred_allsizes(tm) -static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, 128, bs); - pred_ptr += stride; + vpx_memset(dst, 128, bs); + dst += stride; } } intra_pred_allsizes(dc_128) -static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride, - int bs, - uint8_t *above_row, uint8_t *left_col) { - int i, r; - int expected_dc = 128; - int average = 0; - const int count = bs; +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; for (i = 0; i < bs; i++) - average += left_col[i]; - expected_dc = (average + (count >> 1)) / count; + sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, expected_dc, bs); - pred_ptr += stride; + vpx_memset(dst, expected_dc, bs); + dst += stride; } } intra_pred_allsizes(dc_left) -static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int 
bs, - uint8_t *above_row, uint8_t *left_col) { - int i, r; - int expected_dc = 128; - int average = 0; - const int count = bs; +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; for (i = 0; i < bs; i++) - average += above_row[i]; - expected_dc = (average + (count >> 1)) / count; + sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, expected_dc, bs); - pred_ptr += stride; + vpx_memset(dst, expected_dc, bs); + dst += stride; } } intra_pred_allsizes(dc_top) -static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { - int i, r; - int expected_dc = 128; - int average = 0; +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; const int count = 2 * bs; - for (i = 0; i < bs; i++) - average += above_row[i]; - for (i = 0; i < bs; i++) - average += left_col[i]; - expected_dc = (average + (count >> 1)) / count; + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, expected_dc, bs); - pred_ptr += stride; + vpx_memset(dst, expected_dc, bs); + dst += stride; } } intra_pred_allsizes(dc) #undef intra_pred_allsizes -typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride, - uint8_t *above_row, uint8_t *left_col); +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); -static intra_pred_fn pred[VP9_INTRA_MODES][4]; +static intra_pred_fn pred[INTRA_MODES][4]; static intra_pred_fn dc_pred[2][2][4]; static void init_intra_pred_fn_ptrs(void) { @@ -334,7 +297,7 @@ static void init_intra_pred_fn_ptrs(void) { intra_pred_allsizes(pred[V_PRED], v); intra_pred_allsizes(pred[H_PRED], h); - intra_pred_allsizes(pred[D27_PRED], d27); + intra_pred_allsizes(pred[D207_PRED], d207); intra_pred_allsizes(pred[D45_PRED], d45); intra_pred_allsizes(pred[D63_PRED], d63); intra_pred_allsizes(pred[D117_PRED], d117); @@ -350,16 +313,17 @@ static void init_intra_pred_fn_ptrs(void) { #undef intra_pred_allsizes } -static void build_intra_predictors(uint8_t *src, int src_stride, - uint8_t *pred_ptr, int stride, - MB_PREDICTION_MODE mode, TX_SIZE txsz, +static void build_intra_predictors(const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + MB_PREDICTION_MODE mode, TX_SIZE tx_size, int up_available, int left_available, int right_available) { int i; DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64); - DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16); - uint8_t *above_row = yabove_data + 16; - const int bs = 4 << txsz; + DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16); + uint8_t *above_row = above_data + 16; + const uint8_t *const_above_row = above_row; + const int bs = 4 << tx_size; // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z @@ -369,45 +333,46 @@ static void build_intra_predictors(uint8_t *src, int src_stride, // .. 
once(init_intra_pred_fn_ptrs); + + // left if (left_available) { for (i = 0; i < bs; i++) - left_col[i] = src[i * src_stride - 1]; + left_col[i] = ref[i * ref_stride - 1]; } else { vpx_memset(left_col, 129, bs); } + // above if (up_available) { - uint8_t *above_ptr = src - src_stride; + const uint8_t *above_ref = ref - ref_stride; if (bs == 4 && right_available && left_available) { - above_row = above_ptr; + const_above_row = above_ref; } else { - vpx_memcpy(above_row, above_ptr, bs); + vpx_memcpy(above_row, above_ref, bs); if (bs == 4 && right_available) - vpx_memcpy(above_row + bs, above_ptr + bs, bs); + vpx_memcpy(above_row + bs, above_ref + bs, bs); else vpx_memset(above_row + bs, above_row[bs - 1], bs); - above_row[-1] = left_available ? above_ptr[-1] : 129; + above_row[-1] = left_available ? above_ref[-1] : 129; } } else { vpx_memset(above_row, 127, bs * 2); above_row[-1] = 127; } + // predict if (mode == DC_PRED) { - dc_pred[left_available][up_available][txsz](pred_ptr, stride, - above_row, left_col); + dc_pred[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, left_col); } else { - pred[mode][txsz](pred_ptr, stride, above_row, left_col); + pred[mode][tx_size](dst, dst_stride, const_above_row, left_col); } } -void vp9_predict_intra_block(MACROBLOCKD *xd, - int block_idx, - int bwl_in, - TX_SIZE tx_size, - int mode, - uint8_t *reference, int ref_stride, - uint8_t *predictor, int pre_stride) { +void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride) { const int bwl = bwl_in - tx_size; const int wmask = (1 << bwl) - 1; const int have_top = (block_idx >> bwl) || xd->up_available; @@ -415,10 +380,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd, const int have_right = ((block_idx & wmask) != wmask); assert(bwl >= 0); - build_intra_predictors(reference, ref_stride, - predictor, pre_stride, - mode, - tx_size, - have_top, have_left, - have_right); + build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top, have_left, have_right); } diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index e369a71..e9d0dbf 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -14,17 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n, - int tx, int ty); - -MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block, - uint8_t *ptr, int stride); - -void vp9_predict_intra_block(MACROBLOCKD *xd, - int block_idx, - int bwl_in, - TX_SIZE tx_size, - int mode, uint8_t *ref, int ref_stride, - uint8_t *predictor, int pre_stride); +void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index 6bb3cb8..042afbb 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -21,10 +21,11 @@ EOF forward_decls vp9_common_forward_decls # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. 
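The rewritten predictors above all reduce to short arithmetic kernels over the above/left edge pixels. A standalone, runnable sketch of the new dc_predictor logic (toy edge values; plain libc memset standing in for vpx_memset; not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                         const uint8_t *above, const uint8_t *left) {
  int i, r, expected_dc, sum = 0;
  const int count = 2 * bs;
  for (i = 0; i < bs; i++) {
    sum += above[i];   /* top edge */
    sum += left[i];    /* left edge */
  }
  expected_dc = (sum + (count >> 1)) / count;  /* rounded average */
  for (r = 0; r < bs; r++) {
    memset(dst, expected_dc, bs);
    dst += stride;
  }
}

int main(void) {
  const uint8_t above[4] = { 100, 102, 104, 106 };
  const uint8_t left[4] = { 90, 92, 94, 96 };
  uint8_t dst[16];
  dc_predictor(dst, 4, 4, above, left);
  printf("dc = %d\n", dst[0]);  /* (784 + 4) / 8 = 98 */
  return 0;
}

The dc_top and dc_left variants are the same rounded average restricted to a single edge, which is why build_intra_predictors indexes dc_pred by [left_available][up_available].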
-[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse2_x86inc=sse2 && ssse3_x86inc=ssse3 +[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && + sse2_x86inc=sse2 && ssse3_x86inc=ssse3 # this variable is for functions that are 64 bit only. -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 # # Dequant @@ -45,160 +46,160 @@ specialize vp9_idct_add_32x32 # # RECON # -prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_4x4 +prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_4x4 -prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_4x4 ssse3 +prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_4x4 $ssse3_x86inc -prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_4x4 -prototype void vp9_h_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_4x4 ssse3 +prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_4x4 $ssse3_x86inc -prototype void vp9_d117_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_4x4 -prototype void vp9_d135_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_4x4 -prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_4x4 -prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_4x4 sse +prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_4x4 $sse_x86inc -prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_tm_predictor_4x4 sse +prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_4x4 $sse_x86inc -prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_dc_predictor_4x4 sse +prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_4x4 $sse_x86inc -prototype void vp9_dc_top_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t 
y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_4x4 -prototype void vp9_dc_left_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_4x4 -prototype void vp9_dc_128_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_4x4 -prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_8x8 +prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_8x8 -prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_8x8 ssse3 +prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_8x8 $ssse3_x86inc -prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_8x8 -prototype void vp9_h_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_8x8 ssse3 +prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_8x8 $ssse3_x86inc -prototype void vp9_d117_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_8x8 -prototype void vp9_d135_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_8x8 -prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_8x8 -prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_8x8 sse +prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_8x8 $sse_x86inc -prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_tm_predictor_8x8 sse2 +prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_8x8 $sse2_x86inc -prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" 
-specialize vp9_dc_predictor_8x8 sse +prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_8x8 $sse_x86inc -prototype void vp9_dc_top_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_8x8 -prototype void vp9_dc_left_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_8x8 -prototype void vp9_dc_128_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_8x8 -prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_16x16 +prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_16x16 -prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_16x16 ssse3 +prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_16x16 $ssse3_x86inc -prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_16x16 -prototype void vp9_h_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_16x16 ssse3 +prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_16x16 $ssse3_x86inc -prototype void vp9_d117_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_16x16 -prototype void vp9_d135_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_16x16 -prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_16x16 -prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_16x16 sse2 +prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_16x16 $sse2_x86inc -prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t 
*yleft_col" -specialize vp9_tm_predictor_16x16 sse2 +prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_16x16 $sse2_x86inc -prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_dc_predictor_16x16 sse2 +prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_16x16 $sse2_x86inc -prototype void vp9_dc_top_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_16x16 -prototype void vp9_dc_left_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_16x16 -prototype void vp9_dc_128_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_16x16 -prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_32x32 +prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_32x32 -prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_32x32 ssse3 +prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_32x32 $ssse3_x86inc -prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_32x32 -prototype void vp9_h_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_32x32 ssse3 +prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_32x32 $ssse3 x86inc -prototype void vp9_d117_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_32x32 -prototype void vp9_d135_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_32x32 -prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_32x32 -prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t 
y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_32x32 sse2 +prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_32x32 $sse2_x86inc -prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_tm_predictor_32x32 sse2_x86_64 +prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_32x32 $sse2_x86_64 -prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_dc_predictor_32x32 sse2 +prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_32x32 $sse2_x86inc -prototype void vp9_dc_top_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_32x32 -prototype void vp9_dc_left_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_32x32 -prototype void vp9_dc_128_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_32x32 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then @@ -236,7 +237,7 @@ specialize vp9_loop_filter_horizontal_edge mmx neon # # post proc # -if [ "$CONFIG_POSTPROC" = "yes" ]; then +if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit" specialize vp9_mbpost_proc_down mmx sse2 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm @@ -267,10 +268,10 @@ specialize vp9_blend_b # Sub Pixel Filters # prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy $sse2_x86inc +specialize vp9_convolve_copy $sse2_x86inc neon prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg $sse2_x86inc +specialize vp9_convolve_avg $sse2_x86inc neon prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8 ssse3 neon @@ -294,40 +295,40 @@ specialize vp9_convolve8_avg_vert ssse3 neon # dct # prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_1_add sse2 +specialize vp9_short_idct4x4_1_add sse2 neon prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_add sse2 +specialize vp9_short_idct4x4_add sse2 neon prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t 
*dest, int dest_stride" -specialize vp9_short_idct8x8_1_add sse2 +specialize vp9_short_idct8x8_1_add sse2 neon prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_8x8_add sse2 +specialize vp9_short_idct10_8x8_add sse2 neon prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_1_add sse2 +specialize vp9_short_idct16x16_1_add sse2 neon prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_add sse2 +specialize vp9_short_idct16x16_add sse2 neon prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_16x16_add sse2 +specialize vp9_short_idct10_16x16_add sse2 neon prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct32x32_add sse2 +specialize vp9_short_idct32x32_add sse2 neon prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add sse2 +specialize vp9_short_iht4x4_add sse2 neon prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht8x8_add sse2 +specialize vp9_short_iht8x8_add sse2 neon prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" specialize vp9_short_iht16x16_add sse2 @@ -342,12 +343,6 @@ specialize vp9_short_iwalsh4x4_1_add prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_iwalsh4x4_add -prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" -specialize vp9_sad32x3 - -prototype unsigned int vp9_sad3x32 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" -specialize vp9_sad3x32 - # # Encoder functions below this point. 
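Since the specialize churn above repeats for every function, a quick reminder of the mechanism: prototype declares a function and specialize lists its ISA variants; the rtcd generator turns each entry into either a #define aliasing the _c version (when nothing is specialized) or a function pointer assigned once at startup. A hedged, self-contained sketch of that dispatch pattern (names and the flag value are illustrative, not the real generated code):

#include <stdio.h>

#define HAS_SSE2 0x08  /* illustrative flag bit, not the real value */

static void predictor_c(void)    { puts("C path"); }
static void predictor_sse2(void) { puts("SSE2 path"); }

static void (*predictor)(void);

static void setup_rtcd_internal(int flags) {
  predictor = predictor_c;                       /* safe default */
  if (flags & HAS_SSE2) predictor = predictor_sse2;
}

int main(void) {
  setup_rtcd_internal(HAS_SSE2);  /* pretend CPUID reported SSE2 */
  predictor();                    /* prints "SSE2 path" */
  return 0;
}

The switch from bare sse2/ssse3 tokens to $sse2_x86inc and $ssse3_x86inc ties into the x86inc comment above: when CONFIG_USE_X86INC is off, those shell variables are empty, so the corresponding specializations simply vanish from the generated header.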
# @@ -356,217 +351,214 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then # variance prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x16 sse2 +specialize vp9_variance32x16 $sse2_x86inc prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x32 sse2 +specialize vp9_variance16x32 $sse2_x86inc prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x32 sse2 +specialize vp9_variance64x32 $sse2_x86inc prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x64 sse2 +specialize vp9_variance32x64 $sse2_x86inc prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x32 sse2 +specialize vp9_variance32x32 $sse2_x86inc prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x64 sse2 +specialize vp9_variance64x64 $sse2_x86inc prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x16 mmx sse2 +specialize vp9_variance16x16 mmx $sse2_x86inc prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x8 mmx sse2 +specialize vp9_variance16x8 mmx $sse2_x86inc prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x16 mmx sse2 +specialize vp9_variance8x16 mmx $sse2_x86inc prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x8 mmx sse2 +specialize vp9_variance8x8 mmx $sse2_x86inc prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum" specialize vp9_get_sse_sum_8x8 sse2 vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2 prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x4 sse2 +specialize vp9_variance8x4 $sse2_x86inc prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x8 sse2 +specialize vp9_variance4x8 $sse2_x86inc prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x4 mmx sse2 +specialize vp9_variance4x4 mmx $sse2_x86inc prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 ssse3 +specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int 
vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 sse2 ssse3 +specialize vp9_sub_pixel_variance32x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance32x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 sse2 ssse3 +specialize vp9_sub_pixel_variance64x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance64x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 sse2 ssse3 +specialize vp9_sub_pixel_variance32x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance32x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 sse2 ssse3 +specialize vp9_sub_pixel_variance16x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 ssse3 +specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance16x16 "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 ssse3 +specialize vp9_sub_pixel_variance16x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance16x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 ssse3 +specialize vp9_sub_pixel_variance8x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance8x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 ssse3 +specialize vp9_sub_pixel_variance16x8 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance16x8 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 ssse3 +specialize vp9_sub_pixel_variance8x8 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance8x8 $sse2_x86inc $ssse3_x86inc # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 sse2 ssse3 +specialize vp9_sub_pixel_variance8x4 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance8x4 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 sse ssse3 +specialize vp9_sub_pixel_variance4x8 $sse_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 sse ssse3 +specialize vp9_sub_pixel_avg_variance4x8 $sse_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse ssse3 +specialize vp9_sub_pixel_variance4x4 $sse_x86inc $ssse3_x86inc #vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 sse ssse3 +specialize vp9_sub_pixel_avg_variance4x4 $sse_x86inc $ssse3_x86inc prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x64 sse2 +specialize vp9_sad64x64 $sse2_x86inc prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x64 sse2 +specialize vp9_sad32x64 $sse2_x86inc prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x32 sse2 +specialize vp9_sad64x32 $sse2_x86inc prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x16 sse2 +specialize vp9_sad32x16 $sse2_x86inc prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x32 sse2 +specialize vp9_sad16x32 $sse2_x86inc prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x32 sse2 +specialize vp9_sad32x32 $sse2_x86inc prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x16 mmx sse2 +specialize vp9_sad16x16 mmx $sse2_x86inc prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x8 mmx sse2 +specialize vp9_sad16x8 mmx $sse2_x86inc prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x16 mmx sse2 +specialize vp9_sad8x16 mmx $sse2_x86inc prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x8 mmx sse2 +specialize vp9_sad8x8 mmx $sse2_x86inc prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x4 sse2 +specialize vp9_sad8x4 $sse2_x86inc prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x8 sse +specialize vp9_sad4x8 $sse_x86inc prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int max_sad" -specialize vp9_sad4x4 mmx sse +specialize vp9_sad4x4 mmx $sse_x86inc prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x64_avg sse2 +specialize vp9_sad64x64_avg $sse2_x86inc prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x64_avg sse2 +specialize vp9_sad32x64_avg $sse2_x86inc prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x32_avg sse2 +specialize vp9_sad64x32_avg $sse2_x86inc prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x16_avg sse2 +specialize vp9_sad32x16_avg $sse2_x86inc prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x32_avg sse2 +specialize vp9_sad16x32_avg $sse2_x86inc prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x32_avg sse2 +specialize vp9_sad32x32_avg $sse2_x86inc prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x16_avg sse2 +specialize vp9_sad16x16_avg $sse2_x86inc prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x8_avg sse2 +specialize vp9_sad16x8_avg $sse2_x86inc prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x16_avg sse2 +specialize vp9_sad8x16_avg $sse2_x86inc prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x8_avg sse2 +specialize vp9_sad8x8_avg $sse2_x86inc prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x4_avg sse2 +specialize vp9_sad8x4_avg $sse2_x86inc prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x8_avg sse +specialize vp9_sad4x8_avg $sse_x86inc prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x4_avg sse +specialize vp9_sad4x4_avg $sse_x86inc prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h sse2 
-vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt +specialize vp9_variance_halfpixvar16x16_h $sse2_x86inc prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v sse2 -vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt +specialize vp9_variance_halfpixvar16x16_v $sse2_x86inc prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv sse2 -vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt +specialize vp9_variance_halfpixvar16x16_hv $sse2_x86inc prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_variance_halfpixvar64x64_h @@ -678,8 +670,7 @@ specialize vp9_sad4x4x4d sse #specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse16x16 mmx sse2 -vp9_mse16x16_sse2=vp9_mse16x16_wmt +specialize vp9_mse16x16 mmx $sse2_x86inc prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse8x16 @@ -743,7 +734,7 @@ prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int p specialize vp9_short_fdct8x4 sse2 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct32x32 +specialize vp9_short_fdct32x32 sse2 prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct32x32_rd sse2 diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c new file mode 100644 index 0000000..989206c --- /dev/null +++ b/libvpx/vp9/common/vp9_scale.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_scale.h" + +static INLINE int scaled_x(int val, const struct scale_factors *scale) { + return val * scale->x_scale_fp >> REF_SCALE_SHIFT; +} + +static INLINE int scaled_y(int val, const struct scale_factors *scale) { + return val * scale->y_scale_fp >> REF_SCALE_SHIFT; +} + +static int unscaled_value(int val, const struct scale_factors *scale) { + (void) scale; + return val; +} + +static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { + const MV32 res = { + scaled_y(mv->row, scale) + scale->y_offset_q4, + scaled_x(mv->col, scale) + scale->x_offset_q4 + }; + return res; +} + +static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) { + const MV32 res = { + mv->row, + mv->col + }; + return res; +} + +static void set_offsets_with_scaling(struct scale_factors *scale, + int row, int col) { + scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK; + scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK; +} + +static void set_offsets_without_scaling(struct scale_factors *scale, + int row, int col) { + scale->x_offset_q4 = 0; + scale->y_offset_q4 = 0; +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return (other_size << REF_SCALE_SHIFT) / this_size; +} + +static int check_scale_factors(int other_w, int other_h, + int this_w, int this_h) { + return 2 * this_w >= other_w && + 2 * this_h >= other_h && + this_w <= 16 * other_w && + this_h <= 16 * other_h; +} + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + int other_w, int other_h, + int this_w, int this_h) { + if (!check_scale_factors(other_w, other_h, this_w, this_h)) { + scale->x_scale_fp = REF_INVALID_SCALE; + scale->y_scale_fp = REF_INVALID_SCALE; + return; + } + + scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + scale->x_step_q4 = scaled_x(16, scale); + scale->y_step_q4 = scaled_y(16, scale); + scale->x_offset_q4 = 0; // calculated per block + scale->y_offset_q4 = 0; // calculated per block + + if (vp9_is_scaled(scale)) { + scale->scale_value_x = scaled_x; + scale->scale_value_y = scaled_y; + scale->set_scaled_offsets = set_offsets_with_scaling; + scale->scale_mv = scaled_mv; + } else { + scale->scale_value_x = unscaled_value; + scale->scale_value_y = unscaled_value; + scale->set_scaled_offsets = set_offsets_without_scaling; + scale->scale_mv = unscaled_mv; + } + + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + if (scale->x_step_q4 == 16) { + if (scale->y_step_q4 == 16) { + // No scaling in either direction. 
+ scale->predict[0][0][0] = vp9_convolve_copy; + scale->predict[0][0][1] = vp9_convolve_avg; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + scale->predict[0][0][0] = vp9_convolve8_vert; + scale->predict[0][0][1] = vp9_convolve8_avg_vert; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } else { + if (scale->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + scale->predict[0][0][0] = vp9_convolve8_horiz; + scale->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // Must always scale in both directions. + scale->predict[0][0][0] = vp9_convolve8; + scale->predict[0][0][1] = vp9_convolve8_avg; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions + scale->predict[1][1][0] = vp9_convolve8; + scale->predict[1][1][1] = vp9_convolve8_avg; +} diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h new file mode 100644 index 0000000..7a720d0 --- /dev/null +++ b/libvpx/vp9/common/vp9_scale.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
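For the predict table just assembled in vp9_scale.c: callers index it as predict[has_subpel_x][has_subpel_y][avg], matching the "// horiz, vert, avg" note in the header below. A hedged sketch of that lookup (the helper name and the simplified function type are hypothetical; the real convolve_fn_t takes src/dst pointers, strides, filters, steps and a block size):

typedef void (*convolve_fn_t)(void);  /* simplified stand-in for the real type */

static convolve_fn_t pick_predictor(convolve_fn_t predict[2][2][2],
                                    int subpel_x, int subpel_y, int avg) {
  /* Any nonzero q4 fraction on an axis requires filtering along that axis;
     avg selects the _avg variants that blend into the destination. */
  return predict[subpel_x != 0][subpel_y != 0][avg];
}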
+ */ + +#ifndef VP9_COMMON_VP9_SCALE_H_ +#define VP9_COMMON_VP9_SCALE_H_ + +#include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_convolve.h" + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_offset_q4; + int x_step_q4; + int y_offset_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *scale); + int (*scale_value_y)(int val, const struct scale_factors *scale); + void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); + MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); + + convolve_fn_t predict[2][2][2]; // horiz, vert, avg +}; + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + int other_w, int other_h, + int this_w, int this_h); + +static int vp9_is_valid_scale(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static int vp9_is_scaled(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_NO_SCALE || + sf->y_scale_fp != REF_NO_SCALE; +} + +#endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/libvpx/vp9/common/vp9_subpelvar.h b/libvpx/vp9/common/vp9_subpelvar.h index ad674f1..fe75481 100644 --- a/libvpx/vp9/common/vp9_subpelvar.h +++ b/libvpx/vp9/common/vp9_subpelvar.h @@ -11,7 +11,8 @@ #ifndef VP9_COMMON_VP9_SUBPELVAR_H_ #define VP9_COMMON_VP9_SUBPELVAR_H_ -#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_convolve.h" static void variance(const uint8_t *src_ptr, int source_stride, @@ -78,10 +79,10 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, for (i = 0; i < output_height; i++) { for (j = 0; j < output_width; j++) { - // Apply bilinear filter - output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[pixel_step] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; } @@ -127,20 +128,16 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int output_width, const int16_t *vp9_filter) { unsigned int i, j; - int Temp; for (i = 0; i < output_height; i++) { for (j = 0; j < output_width; j++) { - // Apply filter - Temp = ((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[pixel_step] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2); - output_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT); + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); src_ptr++; } - // Next row... 
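On the vp9_subpelvar.h hunks above: the switch to ROUND_POWER_OF_TWO is intended as a pure refactor, since adding half the filter weight and shifting by VP9_FILTER_SHIFT rounds exactly like ROUND_POWER_OF_TWO(v, FILTER_BITS). A standalone check, with every constant restated locally as an assumption (ROUND_POWER_OF_TWO as in vp9_common.h, FILTER_BITS == 7, and the old 128/7 weight/shift pair):

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define FILTER_BITS 7
#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7

int main(void) {
  int v;
  /* Bilinear taps sum to 128 and pixels are 0..255, so the filtered sums
     never exceed 255 * 128; both roundings agree over the whole range. */
  for (v = 0; v <= 255 * VP9_FILTER_WEIGHT; ++v)
    assert(((v + VP9_FILTER_WEIGHT / 2) >> VP9_FILTER_SHIFT) ==
           ROUND_POWER_OF_TWO(v, FILTER_BITS));
  return 0;
}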
- src_ptr += src_pixels_per_line - output_width; + src_ptr += src_pixels_per_line - output_width; output_ptr += output_width; } } diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h index 1b9147e..cc909e2 100644 --- a/libvpx/vp9/common/vp9_systemdependent.h +++ b/libvpx/vp9/common/vp9_systemdependent.h @@ -34,6 +34,6 @@ static int round(double x) { #endif struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *); +void vp9_machine_specific_config(struct VP9Common *cm); #endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index a72d2ab..1791c1a 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -14,7 +14,7 @@ #define MAX_TILE_WIDTH_B64 64 static int to_sbs(n_mis) { - return mi_cols_aligned_to_sb(n_mis) >> LOG2_MI_BLOCK_SIZE; + return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; } static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 4af4f94..fa4dd9b 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -17,23 +17,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]); - DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]); - - DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]); - DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]); - - DECLARE_ALIGNED(16, unsigned char, ap[8][8]); - DECLARE_ALIGNED(16, unsigned char, aq[8][8]); - - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - int i = 0; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; const unsigned int extended_thresh = _thresh[0] * 0x01010101u; const unsigned int extended_limit = _limit[0] * 0x01010101u; const unsigned int extended_blimit = _blimit[0] * 0x01010101u; @@ -44,41 +32,35 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const __m128i blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); - p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p)); - - _mm_storel_epi64((__m128i *)ap[4], p4); - _mm_storel_epi64((__m128i *)ap[3], p3); - _mm_storel_epi64((__m128i *)ap[2], p2); - _mm_storel_epi64((__m128i *)ap[1], p1); - _mm_storel_epi64((__m128i *)ap[0], p0); - _mm_storel_epi64((__m128i *)aq[4], q4); - _mm_storel_epi64((__m128i *)aq[3], q3); - _mm_storel_epi64((__m128i *)aq[2], q2); - _mm_storel_epi64((__m128i *)aq[1], q1); - _mm_storel_epi64((__m128i *)aq[0], q0); - + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), + (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + 
q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), + (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), + (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), + (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), + (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); @@ -88,19 +70,16 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); + mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } @@ -110,21 +89,19 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i 
flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); @@ -134,82 +111,60 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); // loopfilter done { __m128i work; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); + flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p)); - flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), - _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), - _mm_subs_epu8(q0, q5))); - _mm_storel_epi64((__m128i *)ap[5], p5); - _mm_storel_epi64((__m128i *)aq[5], q5); - flat2 = _mm_max_epu8(work, flat2); - p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p)); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), - _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), - 
_mm_subs_epu8(q0, q6))); - _mm_storel_epi64((__m128i *)ap[6], p6); - _mm_storel_epi64((__m128i *)aq[6], q6); - flat2 = _mm_max_epu8(work, flat2); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *)(s + 6 * p))); + + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *)(s + 7 * p))); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); - p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p)); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), - _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), - _mm_subs_epu8(q0, q7))); - _mm_storel_epi64((__m128i *)ap[7], p7); - _mm_storel_epi64((__m128i *)aq[7], q7); flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask @@ -220,260 +175,198 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, { const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); - { - __m128i workp_shft; - __m128i a, b, c; - - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero); - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero); - - c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 - c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); - - b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); - a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); - a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); - - _mm_storel_epi64((__m128i *)&flat_op[2][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q1, a); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); - _mm_storel_epi64((__m128i *)&flat_op[1][i*8], - 
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q2, a); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); - _mm_storel_epi64((__m128i *)&flat_op[0][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q3, a); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); - _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - b = _mm_add_epi16(q3, b); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); - _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(q4, c); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - b = _mm_add_epi16(q3, b); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); - _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - a = _mm_add_epi16(q5, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q6, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = 
_mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - } + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero);; + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, + pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, + pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = 
_mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - work_a = _mm_loadl_epi64((__m128i *)ap[2]); - p2 = _mm_loadl_epi64((__m128i *)flat_op[2]); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)flat_op[2], p2); - - p1 = _mm_loadl_epi64((__m128i *)flat_op[1]); - work_a = _mm_andnot_si128(flat, ps1); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - _mm_storel_epi64((__m128i *)flat_op[1], p1); - - p0 = _mm_loadl_epi64((__m128i *)flat_op[0]); - work_a = _mm_andnot_si128(flat, ps0); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - _mm_storel_epi64((__m128i *)flat_op[0], p0); - - q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]); - work_a = _mm_andnot_si128(flat, qs0); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - _mm_storel_epi64((__m128i *)flat_oq[0], q0); - - q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]); - work_a = _mm_andnot_si128(flat, qs1); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - _mm_storel_epi64((__m128i *)flat_oq[1], q1); 
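Both the removed scratch-buffer code here and the packed replacement rely on the same SSE2 select idiom, dst = (mask & filtered) | (~mask & original), with flat and flat2 as per-byte 0x00/0xFF masks produced by the comparisons earlier. A scalar restatement of the idiom (illustrative only):

#include <assert.h>
#include <stdint.h>

static uint8_t select_byte(uint8_t mask, uint8_t filtered, uint8_t orig) {
  /* Same trick as the _mm_and_si128 / _mm_andnot_si128 / _mm_or_si128
     triples above: mask is 0xFF where the flat filter applies. */
  return (uint8_t)((mask & filtered) | (~mask & orig));
}

int main(void) {
  assert(select_byte(0xFF, 7, 200) == 7);    /* flat: take the filtered value */
  assert(select_byte(0x00, 7, 200) == 200);  /* not flat: keep the original */
  return 0;
}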
- - work_a = _mm_loadl_epi64((__m128i *)aq[2]); - q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - _mm_storel_epi64((__m128i *)flat_oq[2], q2); - - // write out op6 - op3 - { - unsigned char *dst = (s - 7 * p); - for (i = 6; i > 2; i--) { - __m128i flat2_output; - work_a = _mm_loadl_epi64((__m128i *)ap[i]); - flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]); - work_a = _mm_andnot_si128(flat2, work_a); - flat2_output = _mm_and_si128(flat2, flat2_output); - work_a = _mm_or_si128(work_a, flat2_output); - _mm_storel_epi64((__m128i *)dst, work_a); - dst += p; - } - } - - work_a = _mm_loadl_epi64((__m128i *)flat_op[2]); - p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]); - work_a = _mm_andnot_si128(flat2, work_a); - p2 = _mm_and_si128(flat2, p2); - p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - - work_a = _mm_loadl_epi64((__m128i *)flat_op[1]); - p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]); - work_a = _mm_andnot_si128(flat2, work_a); - p1 = _mm_and_si128(flat2, p1); - p1 = _mm_or_si128(work_a, p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - - work_a = _mm_loadl_epi64((__m128i *)flat_op[0]); - p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]); - work_a = _mm_andnot_si128(flat2, work_a); - p0 = _mm_and_si128(flat2, p0); - p0 = _mm_or_si128(work_a, p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - - work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]); - q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]); - work_a = _mm_andnot_si128(flat2, work_a); - q0 = _mm_and_si128(flat2, q0); - q0 = _mm_or_si128(work_a, q0); - _mm_storel_epi64((__m128i *)(s - 0 * p), q0); - - work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]); - q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]); - work_a = _mm_andnot_si128(flat2, work_a); - q1 = _mm_and_si128(flat2, q1); - q1 = _mm_or_si128(work_a, q1); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - - work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]); - q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]); - work_a = _mm_andnot_si128(flat2, work_a); - q2 = _mm_and_si128(flat2, q2); - q2 = _mm_or_si128(work_a, q2); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - - // write out oq3 - oq7 - { - unsigned char *dst = (s + 3 * p); - for (i = 3; i < 7; i++) { - __m128i flat2_output; - work_a = _mm_loadl_epi64((__m128i *)aq[i]); - flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]); - work_a = _mm_andnot_si128(flat2, work_a); - flat2_output = _mm_and_si128(flat2, flat2_output); - work_a = _mm_or_si128(work_a, flat2_output); - _mm_storel_epi64((__m128i *)dst, work_a); - dst += p; - } - } + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * 
p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } } diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.c b/libvpx/vp9/decoder/vp9_dboolhuff.c index 31b1ae2..06acec4 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.c +++ b/libvpx/vp9/decoder/vp9_dboolhuff.c @@ -16,7 +16,7 @@ // This is meant to be a large, positive constant that can still be efficiently // loaded as an immediate (on platforms like ARM, for example). // Even relatively modest values like 100 would work fine. -#define VP9_LOTS_OF_BITS 0x40000000 +#define LOTS_OF_BITS 0x40000000 int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { @@ -41,13 +41,13 @@ void vp9_reader_fill(vp9_reader *r) { const uint8_t *buffer = r->buffer; VP9_BD_VALUE value = r->value; int count = r->count; - int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8); + int shift = BD_VALUE_SIZE - 8 - (count + 8); int loop_end = 0; const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT); const int x = shift + CHAR_BIT - bits_left; if (x >= 0) { - count += VP9_LOTS_OF_BITS; + count += LOTS_OF_BITS; loop_end = x; } @@ -66,7 +66,7 @@ void vp9_reader_fill(vp9_reader *r) { const uint8_t *vp9_reader_find_end(vp9_reader *r) { // Find the end of the coded buffer - while (r->count > CHAR_BIT && r->count < VP9_BD_VALUE_SIZE) { + while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) { r->count -= CHAR_BIT; r->buffer--; } @@ -83,10 +83,10 @@ int vp9_reader_has_error(vp9_reader *r) { // // When reading a byte from the user's buffer, count is filled with 8 and // one byte is filled into the value buffer. When we reach the end of the - // data, count is additionally filled with VP9_LOTS_OF_BITS. So when - // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. + // data, count is additionally filled with LOTS_OF_BITS. So when + // count == LOTS_OF_BITS - 1, the user's data has been exhausted. // // 1 if we have tried to decode bits after the end of stream was encountered. // 0 No error. 
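The renamed constants make the vp9_dboolhuff end-of-stream bookkeeping above easier to restate: refills add 8 per byte while data remains, a one-time LOTS_OF_BITS is folded into count at exhaustion, and decoding then draws count back down, so a count strictly between BD_VALUE_SIZE and LOTS_OF_BITS can only mean bits were consumed past the end. A standalone sketch of that predicate (the constants mirror this file; the test values are illustrative):

#include <assert.h>
#include <limits.h>
#include <stddef.h>

#define BD_VALUE_SIZE ((int)sizeof(size_t) * CHAR_BIT)  /* as in vp9_dboolhuff.h */
#define LOTS_OF_BITS 0x40000000

static int reader_has_error(int count) {
  /* Counts <= BD_VALUE_SIZE mean normal decoding; counts at or above
     LOTS_OF_BITS mean the end was reached but nothing was over-read. */
  return count > BD_VALUE_SIZE && count < LOTS_OF_BITS;
}

int main(void) {
  assert(!reader_has_error(24));               /* normal decoding */
  assert(!reader_has_error(LOTS_OF_BITS + 8)); /* at the end, nothing over-read */
  assert(reader_has_error(LOTS_OF_BITS - 1));  /* decoded past the end */
  return 0;
}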
- return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS; + return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; } diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h index c46dd73..c864516 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.h +++ b/libvpx/vp9/decoder/vp9_dboolhuff.h @@ -20,7 +20,7 @@ typedef size_t VP9_BD_VALUE; -#define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) +#define BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) typedef struct { const uint8_t *buffer_end; @@ -52,7 +52,7 @@ static int vp9_read(vp9_reader *br, int probability) { value = br->value; count = br->count; - bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8); + bigsplit = (VP9_BD_VALUE)split << (BD_VALUE_SIZE - 8); range = split; diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index a3e2ad3..84a29b1 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -43,7 +43,7 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { } static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, vp9_reader *r) { + BLOCK_SIZE bsize, vp9_reader *r) { const uint8_t context = vp9_get_pred_context_tx_size(xd); const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs); TX_SIZE tx_size = vp9_read(r, tx_probs[0]); @@ -58,7 +58,7 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, } static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, - BLOCK_SIZE_TYPE bsize, int allow_select, + BLOCK_SIZE bsize, int allow_select, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; @@ -75,7 +75,7 @@ static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, return TX_4X4; } -static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, +static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = 1 << mi_width_log2(bsize); @@ -94,8 +94,8 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r) { MACROBLOCKD *const xd = &pbi->mb; - struct segmentation *const seg = &xd->seg; - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + struct segmentation *const seg = &pbi->common.seg; + const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; int segment_id; if (!seg->enabled) @@ -113,8 +113,8 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - struct segmentation *const seg = &xd->seg; - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + struct segmentation *const seg = &cm->seg; + const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; int pred_segment_id, segment_id; if (!seg->enabled) @@ -126,9 +126,9 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, return pred_segment_id; if (seg->temporal_update) { - const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); const int pred_flag = vp9_read(r, pred_prob); - vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + vp9_set_pred_flag_seg_id(xd, pred_flag); segment_id = pred_flag ? 
pred_segment_id : read_segment_id(r, seg); } else { @@ -141,7 +141,7 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - int skip_coeff = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); + int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); if (!skip_coeff) { const int ctx = vp9_get_pred_context_mbskip(xd); skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); @@ -155,19 +155,20 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &m->mbmi; - const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; - const int mis = cm->mode_info_stride; + const BLOCK_SIZE bsize = mbmi->sb_type; + const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride]; + const MODE_INFO *left_mi = xd->mi_8x8[-1]; mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r); - mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); - mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); + mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; - if (bsize >= BLOCK_SIZE_SB8X8) { - const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); + if (bsize >= BLOCK_8X8) { + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(m, 0) : DC_PRED; + left_block_mode(m, left_mi, 0) : DC_PRED; mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); } else { // Only 4x4, 4x8, 8x4 blocks @@ -178,9 +179,9 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int ib = idy * 2 + idx; - const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis); + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(m, ib) : DC_PRED; + left_block_mode(m, left_mi, ib) : DC_PRED; const MB_PREDICTION_MODE b_mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); m->bmi[ib].as_mode = b_mode; @@ -251,7 +252,7 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, } static void update_mv(vp9_reader *r, vp9_prob *p) { - if (vp9_read(r, VP9_NMV_UPDATE_PROB)) + if (vp9_read(r, NMV_UPDATE_PROB)) *p = (vp9_read_literal(r, 7) << 1) | 1; } @@ -303,8 +304,8 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { - ref_frame[0] = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME); + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); @@ -345,17 +346,17 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j) - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j) + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); } static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < VP9_INTER_MODES - 1; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + for (j = 0; j < INTER_MODES - 1; ++j) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); } @@ -370,23 +371,23 @@ static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( VP9D_COMP *pbi, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const vp9_prob *probs = vp9_get_pred_probs_switchable_interp(cm, xd); - const int index = treed_read(r, vp9_switchable_interp_tree, probs); const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][index]; - return vp9_switchable_interp[index]; + const int type = treed_read(r, vp9_switchable_interp_tree, + cm->fc.switchable_interp_prob[ctx]); + ++cm->counts.switchable_interp[ctx][type]; + return type; } static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MB_MODE_INFO *const mbmi = &mi->mbmi; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { const int size_group = size_group_lookup[bsize]; mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]); cm->counts.y_mode[size_group][mbmi->mode]++; @@ -420,8 +421,8 @@ static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { - return vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) != + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; } else { const int ctx = vp9_get_pred_context_intra_inter(xd); @@ 
-439,54 +440,56 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *const mbmi = &mi->mbmi; int_mv *const mv0 = &mbmi->mv[0]; int_mv *const mv1 = &mbmi->mv[1]; - const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = xd->allow_high_precision_mv; int_mv nearest, nearby, best_mv; int_mv nearest_second, nearby_second, best_mv_second; uint8_t inter_mode_ctx; - MV_REFERENCE_FRAME ref0, ref1; + MV_REFERENCE_FRAME ref0; + int is_compound; + mbmi->uv_mode = DC_PRED; read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame); ref0 = mbmi->ref_frame[0]; - ref1 = mbmi->ref_frame[1]; + is_compound = has_second_ref(mbmi); - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias, + vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], mi_row, mi_col); - inter_mode_ctx = mbmi->mb_mode_context[ref0]; + inter_mode_ctx = mbmi->mode_context[ref0]; - if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; - else if (bsize >= BLOCK_SIZE_SB8X8) - mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); - - mbmi->uv_mode = DC_PRED; + assert(bsize >= BLOCK_8X8); + } else { + if (bsize >= BLOCK_8X8) + mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); + } // nearest, nearby - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; } - mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE - ? read_switchable_filter_type(pbi, r) - : cm->mcomp_filter_type; - - if (ref1 > INTRA_FRAME) { - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias, - mi_row, mi_col); + if (is_compound) { + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; + vp9_find_mv_refs(cm, xd, mi, xd->last_mi, + ref1, mbmi->ref_mvs[ref1], mi_row, mi_col); - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], &nearest_second, &nearby_second); best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; } } - if (bsize < BLOCK_SIZE_SB8X8) { + mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE + ? 
read_switchable_filter_type(pbi, r) + : cm->mcomp_filter_type; + + if (bsize < BLOCK_8X8) { const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; @@ -500,7 +503,7 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0, mi_row, mi_col); - if (ref1 > 0) + if (is_compound) vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, &nearby_second, j, 1, mi_row, mi_col); @@ -511,30 +514,30 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp); - if (ref1 > 0) + if (is_compound) read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv, allow_hp); break; case NEARESTMV: blockmv.as_int = nearest.as_int; - if (ref1 > 0) + if (is_compound) secondmv.as_int = nearest_second.as_int; break; case NEARMV: blockmv.as_int = nearby.as_int; - if (ref1 > 0) + if (is_compound) secondmv.as_int = nearby_second.as_int; break; case ZEROMV: blockmv.as_int = 0; - if (ref1 > 0) + if (is_compound) secondmv.as_int = 0; break; default: assert(!"Invalid inter mode value"); } mi->bmi[j].as_mv[0].as_int = blockmv.as_int; - if (ref1 > 0) + if (is_compound) mi->bmi[j].as_mv[1].as_int = secondmv.as_int; if (num_4x4_h == 2) @@ -551,33 +554,25 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, switch (mbmi->mode) { case NEARMV: mv0->as_int = nearby.as_int; - clamp_mv2(&mv0->as_mv, xd); - - if (ref1 > 0) { + if (is_compound) mv1->as_int = nearby_second.as_int; - clamp_mv2(&mv1->as_mv, xd); - } break; case NEARESTMV: mv0->as_int = nearest.as_int; - clamp_mv2(&mv0->as_mv, xd); - - if (ref1 > 0) { + if (is_compound) mv1->as_int = nearest_second.as_int; - clamp_mv2(&mv1->as_mv, xd); - } break; case ZEROMV: mv0->as_int = 0; - if (ref1 > 0) + if (is_compound) mv1->as_int = 0; break; case NEWMV: read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp); - if (ref1 > 0) + if (is_compound) read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv, allow_hp); break; @@ -596,10 +591,10 @@ static void read_inter_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); - mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); inter_block = read_is_inter_block(pbi, mbmi->segment_id, r); - mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type, - !mbmi->mb_skip_coeff || !inter_block, r); + mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type, + !mbmi->skip_coeff || !inter_block, r); if (inter_block) read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r); @@ -615,20 +610,20 @@ static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { if (cm->comp_pred_mode == HYBRID_PREDICTION) for (i = 0; i < COMP_INTER_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); } if (cm->comp_pred_mode != 
SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); } @@ -639,7 +634,7 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove. // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs)); for (k = 0; k < MBSKIP_CONTEXTS; ++k) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]); if (cm->frame_type != KEY_FRAME && !cm->intra_only) { @@ -653,19 +648,19 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { read_switchable_interp_probs(&cm->fc, r); for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]); read_comp_pred(cm, r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - for (i = 0; i < VP9_INTRA_MODES - 1; ++i) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + for (i = 0; i < INTRA_MODES - 1; ++i) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]); for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) for (i = 0; i < PARTITION_TYPES - 1; ++i) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]); read_mv_probs(r, nmvc, xd->allow_high_precision_mv); @@ -675,20 +670,21 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *mi = xd->mode_info_context; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + MODE_INFO *mi = xd->this_mi; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; const int bw = 1 << mi_width_log2(bsize); const int bh = 1 << mi_height_log2(bsize); const int y_mis = MIN(bh, cm->mi_rows - mi_row); const int x_mis = MIN(bw, cm->mi_cols - mi_col); - int x, y; + int x, y, z; if (cm->frame_type == KEY_FRAME || cm->intra_only) read_intra_frame_mode_info(pbi, mi, mi_row, mi_col, r); else read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r); - for (y = 0; y < y_mis; y++) - for (x = !y; x < x_mis; x++) - mi[y * cm->mode_info_stride + x] = *mi; + for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) + for (x = !y; x < x_mis; x++) { + xd->mi_8x8[z + x] = mi; + } } diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index feb6024..34ed0c7 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -63,44 +63,40 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 3; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 2; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 1; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); } -static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) { +static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, 
int q_index) { int i; - const int segment_id = xd->mode_info_context->mbmi.segment_id; - xd->q_index = vp9_get_qindex(xd, segment_id, cm->base_qindex); + xd->plane[0].dequant = cm->y_dequant[q_index]; - xd->plane[0].dequant = cm->y_dequant[xd->q_index]; for (i = 1; i < MAX_MB_PLANE; i++) - xd->plane[i].dequant = cm->uv_dequant[xd->q_index]; + xd->plane[i].dequant = cm->uv_dequant[q_index]; } -static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { MACROBLOCKD* const xd = arg; - struct macroblockd_plane *pd = &xd->plane[plane]; - int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block); const int stride = pd->dst.stride; const int eob = pd->eobs[block]; - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, - raster_block, + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, stride); - - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_4X4: { const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); if (tx_type == DCT_DCT) @@ -120,87 +116,78 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, case TX_32X32: vp9_idct_add_32x32(qcoeff, dst, stride, eob); break; + default: + assert(!"Invalid transform size"); } } -static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { MACROBLOCKD* const xd = arg; - struct macroblockd_plane *pd = &xd->plane[plane]; - MODE_INFO *const mi = xd->mode_info_context; - - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, - raster_block, + struct macroblockd_plane *const pd = &xd->plane[plane]; + MODE_INFO *const mi = xd->this_mi; + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); - const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); - int b_mode; - int plane_b_size; - const int tx_ib = raster_block >> tx_size; - const int mode = plane == 0 ? mi->mbmi.mode - : mi->mbmi.uv_mode; - - if (plane == 0 && mi->mbmi.sb_type < BLOCK_8X8) { - assert(bsize == BLOCK_8X8); - b_mode = mi->bmi[raster_block].as_mode; - } else { - b_mode = mode; - } + const MB_PREDICTION_MODE mode = (plane == 0) + ? ((mi->mbmi.sb_type < BLOCK_8X8) ? 
mi->bmi[raster_block].as_mode + : mi->mbmi.mode) + : mi->mbmi.uv_mode; if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) - extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); + extend_for_intra(xd, plane_bsize, plane, block, tx_size); - plane_b_size = b_width_log2(bsize) - pd->subsampling_x; - vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, - dst, pd->dst.stride, - dst, pd->dst.stride); + vp9_predict_intra_block(xd, raster_block >> tx_size, + b_width_log2(plane_bsize), tx_size, mode, + dst, pd->dst.stride, dst, pd->dst.stride); - // Early exit if there are no coefficients - if (mi->mbmi.mb_skip_coeff) - return; - - decode_block(plane, block, bsize, ss_txfrm_size, arg); + if (!mi->mbmi.skip_coeff) + decode_block(plane, block, plane_bsize, tx_size, arg); } -static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) { +static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - if (xd->mode_info_context->mbmi.mb_skip_coeff) { - vp9_reset_sb_tokens_context(xd, bsize); + if (mbmi->skip_coeff) { + reset_skip_context(xd, bsize); return -1; } else { - if (xd->seg.enabled) - init_dequantizer(&pbi->common, xd); + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); // TODO(dkovalev) if (!vp9_reader_has_error(r)) return vp9_decode_tokens(pbi, r, bsize); } } -static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, +static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, int mi_row, int mi_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int bh = 1 << mi_height_log2(bsize); - const int bw = 1 << mi_width_log2(bsize); - const int mi_idx = mi_row * cm->mode_info_stride + mi_col; - int i; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int offset = mi_row * cm->mode_info_stride + mi_col; + + xd->mode_info_stride = cm->mode_info_stride; + + xd->mi_8x8 = cm->mi_grid_visible + offset; + xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; + + // we are using the mode info context stream here + xd->this_mi = + xd->mi_8x8[0] = xd->mic_stream_ptr; + xd->this_mi->mbmi.sb_type = bsize; + xd->mic_stream_ptr++; - xd->mode_info_context = cm->mi + mi_idx; - xd->mode_info_context->mbmi.sb_type = bsize; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + mi_idx : NULL; - - for (i = 0; i < MAX_MB_PLANE; i++) { - struct macroblockd_plane *pd = &xd->plane[i]; - pd->above_context = cm->above_context[i] + - (mi_col * 2 >> pd->subsampling_x); - pd->left_context = cm->left_context[i] + - (((mi_row * 2) & 15) >> pd->subsampling_y); - } + xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; + set_skip_context(cm, xd, mi_row, mi_col); set_partition_seg_context(cm, xd, mi_row, mi_col); // Distance of Mb to the various image edges. 
These are specified to 8th pel @@ -213,17 +200,21 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - const int ref = mbmi->ref_frame[i] - 1; - + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + const int ref = mbmi->ref_frame[i] - LAST_FRAME; const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; - xd->scale_factor[i] = cm->active_ref_scale[ref]; - setup_pre_planes(xd, i, cfg, mi_row, mi_col, &xd->scale_factor[i]); + const struct scale_factors *sf = &cm->active_ref_scale[ref]; + if (!vp9_is_valid_scale(sf)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid scale factors"); + + xd->scale_factor[i] = *sf; + setup_pre_planes(xd, i, cfg, mi_row, mi_col, sf); xd->corrupted |= cfg->corrupted; } static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { + vp9_reader *r, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; const int less8x8 = bsize < BLOCK_8X8; @@ -240,7 +231,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, bsize = BLOCK_8X8; // Has to be called after set_offsets - mbmi = &xd->mode_info_context->mbmi; + mbmi = &xd->this_mi->mbmi; if (!is_inter_block(mbmi)) { // Intra reconstruction @@ -251,7 +242,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, int eobtotal; set_ref(pbi, 0, mi_row, mi_col); - if (mbmi->ref_frame[1] > INTRA_FRAME) + if (has_second_ref(mbmi)) set_ref(pbi, 1, mi_row, mi_col); vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); @@ -264,7 +255,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, assert(mbmi->sb_type == bsize); if (eobtotal == 0) // skip loopfilter - vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, 1); + vp9_set_pred_flag_mbskip(xd, bsize, 1); else if (eobtotal > 0) foreach_transformed_block(xd, bsize, decode_block, xd); } @@ -273,14 +264,14 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, } static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const pc = &pbi->common; + vp9_reader* r, BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - int bs = (1 << mi_width_log2(bsize)) / 2, n; + const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition = PARTITION_NONE; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; - if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols) + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (bsize < BLOCK_8X8) { @@ -288,24 +279,25 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, return; } else { int pl; - const int idx = check_bsize_coverage(pc, mi_row, mi_col, bsize); - set_partition_seg_context(pc, xd, mi_row, mi_col); + const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols, + mi_row, mi_col); + set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); if (idx == 0) partition = treed_read(r, vp9_partition_tree, - pc->fc.partition_prob[pc->frame_type][pl]); + cm->fc.partition_prob[cm->frame_type][pl]); else if (idx > 0 && - !vp9_read(r, pc->fc.partition_prob[pc->frame_type][pl][idx])) + !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx])) partition = (idx == 1) ? 
PARTITION_HORZ : PARTITION_VERT; else partition = PARTITION_SPLIT; - pc->counts.partition[pl][partition]++; + cm->counts.partition[pl][partition]++; } subsize = get_subsize(bsize, partition); - *(get_sb_index(xd, subsize)) = 0; + *get_sb_index(xd, subsize) = 0; switch (partition) { case PARTITION_NONE: @@ -313,23 +305,24 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, break; case PARTITION_HORZ: decode_modes_b(pbi, mi_row, mi_col, r, subsize); - *(get_sb_index(xd, subsize)) = 1; - if (mi_row + bs < pc->mi_rows) - decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize); + *get_sb_index(xd, subsize) = 1; + if (mi_row + hbs < cm->mi_rows) + decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize); break; case PARTITION_VERT: decode_modes_b(pbi, mi_row, mi_col, r, subsize); - *(get_sb_index(xd, subsize)) = 1; - if (mi_col + bs < pc->mi_cols) - decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize); + *get_sb_index(xd, subsize) = 1; + if (mi_col + hbs < cm->mi_cols) + decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize); break; - case PARTITION_SPLIT: + case PARTITION_SPLIT: { + int n; for (n = 0; n < 4; n++) { - int j = n >> 1, i = n & 0x01; - *(get_sb_index(xd, subsize)) = n; - decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize); + const int j = n >> 1, i = n & 1; + *get_sb_index(xd, subsize) = n; + decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize); } - break; + } break; default: assert(!"Invalid partition type"); } @@ -337,7 +330,7 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { - set_partition_seg_context(pc, xd, mi_row, mi_col); + set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, subsize, bsize); } } @@ -345,18 +338,18 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, static void setup_token_decoder(VP9D_COMP *pbi, const uint8_t *data, size_t read_size, vp9_reader *r) { - VP9_COMMON *pc = &pbi->common; + VP9_COMMON *cm = &pbi->common; const uint8_t *data_end = pbi->source + pbi->source_sz; // Validate the calculated partition length. If the buffer // described by the partition can't be fully read, then restrict // it to the portion that can be (for EC mode) or throw an error. if (!read_is_valid(data, read_size, data_end)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); if (vp9_reader_init(r, data, read_size)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", 1); } @@ -470,8 +463,7 @@ static void setup_loopfilter(struct loopfilter *lf, static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { const int old = *delta_q; - if (vp9_rb_read_bit(rb)) - *delta_q = vp9_rb_read_signed_literal(rb, 4); + *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0; return old != *delta_q; } @@ -498,8 +490,11 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { static INTERPOLATIONFILTERTYPE read_interp_filter_type( struct vp9_read_bit_buffer *rb) { + const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP }; return vp9_rb_read_bit(rb) ? 
SWITCHABLE - : vp9_rb_read_literal(rb, 2); + : literal_to_type[vp9_rb_read_literal(rb, 2)]; } static void read_frame_size(struct vp9_read_bit_buffer *rb, @@ -552,8 +547,8 @@ static void setup_frame_size(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { int width, height; read_frame_size(rb, &width, &height); - setup_display_size(&pbi->common, rb); apply_frame_size(pbi, width, height); + setup_display_size(&pbi->common, rb); } static void setup_frame_size_with_refs(VP9D_COMP *pbi, @@ -579,35 +574,36 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Referenced frame with invalid size"); - setup_display_size(cm, rb); apply_frame_size(pbi, width, height); + setup_display_size(cm, rb); } static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { const int num_threads = pbi->oxcf.max_threads; - VP9_COMMON *const pc = &pbi->common; + VP9_COMMON *const cm = &pbi->common; int mi_row, mi_col; + YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx]; if (pbi->do_loopfilter_inline) { if (num_threads > 1) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; - lf_data->cm = pc; + lf_data->frame_buffer = fb; + lf_data->cm = cm; lf_data->xd = pbi->mb; + lf_data->stop = 0; lf_data->y_only = 0; } - vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level); + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end; + for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within - vpx_memset(&pc->left_context, 0, sizeof(pc->left_context)); - vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); - for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end; - mi_col += MI_BLOCK_SIZE) { + vp9_zero(cm->left_context); + vp9_zero(cm->left_seg_context); + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE) decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64); - } if (pbi->do_loopfilter_inline) { // delay the loopfilter by 1 macroblock row. @@ -617,28 +613,32 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { if (num_threads > 1) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + // decoding has completed: finish up the loop filter in this thread. + if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue; + vp9_worker_sync(&pbi->lf_worker); lf_data->start = lf_start; lf_data->stop = mi_row; pbi->lf_worker.hook = vp9_loop_filter_worker; vp9_worker_launch(&pbi->lf_worker); } else { - YV12_BUFFER_CONFIG *const fb = - &pbi->common.yv12_fb[pbi->common.new_fb_idx]; - vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0); + vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0); } } } if (pbi->do_loopfilter_inline) { - YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + int lf_start; if (num_threads > 1) { - // TODO(jzern): since the loop filter is delayed one mb row, this will be - // forced to wait for the last row scheduled in the for loop. 
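For orientation, the inline loop-filter path above interleaves decoding and filtering with a one-superblock-row lag: a row is only filtered once the row below it has been decoded, because filtering rewrites border pixels that the next row still reads for intra prediction. A minimal standalone sketch of that schedule (stub functions in place of the real decoding and vp9_loop_filter_rows() work, not the library's API):

#include <stdio.h>

#define MI_BLOCK_SIZE 8  /* one 64x64 superblock spans 8 mode-info rows */

static void decode_sb_row(int mi_row) {         /* stub for real decoding */
  printf("decode rows %2d..%2d\n", mi_row, mi_row + MI_BLOCK_SIZE - 1);
}
static void filter_rows(int start, int stop) {  /* stub for the loop filter */
  printf("filter rows %2d..%2d\n", start, stop - 1);
}

int main(void) {
  const int mi_rows = 32;  /* toy frame height in mode-info units */
  int mi_row;
  for (mi_row = 0; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
    decode_sb_row(mi_row);
    if (mi_row > 0)  /* filtering trails decoding by one superblock row */
      filter_rows(mi_row - MI_BLOCK_SIZE, mi_row);
  }
  filter_rows(mi_rows - MI_BLOCK_SIZE, mi_rows);  /* last pending row */
  return 0;
}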
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + vp9_worker_sync(&pbi->lf_worker); + lf_start = lf_data->stop; + } else { + lf_start = mi_row - MI_BLOCK_SIZE; } - vp9_loop_filter_rows(fb, pc, &pbi->mb, - mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0); + vp9_loop_filter_rows(fb, cm, &pbi->mb, + lf_start, cm->mi_rows, 0); } } @@ -661,20 +661,20 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { vp9_reader residual_bc; - VP9_COMMON *const pc = &pbi->common; + VP9_COMMON *const cm = &pbi->common; const uint8_t *const data_end = pbi->source + pbi->source_sz; - const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols); - const int tile_cols = 1 << pc->log2_tile_cols; - const int tile_rows = 1 << pc->log2_tile_rows; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; int tile_row, tile_col; // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. - vpx_memset(pc->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); + vpx_memset(cm->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols)); - vpx_memset(pc->above_seg_context, 0, + vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { @@ -699,9 +699,9 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(pc, tile_row); + vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { - vp9_get_tile_col_offsets(pc, tile_col); + vp9_get_tile_col_offsets(cm, tile_col); setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], data_end - data_ptr2[tile_row][tile_col], &residual_bc); @@ -715,16 +715,16 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { int has_more; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(pc, tile_row); + vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { size_t size; - vp9_get_tile_col_offsets(pc, tile_col); + vp9_get_tile_col_offsets(cm, tile_col); has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; if (has_more) { if (!read_is_valid(data, 4, data_end)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); size = read_be32(data); @@ -810,7 +810,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; - xd->lf.filter_level = 0; + cm->lf.filter_level = 0; return 0; } @@ -861,7 +861,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); setup_frame_size(pbi, rb); } else { - pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); + pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2); @@ -892,11 +892,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->frame_context_idx = vp9_rb_read_literal(rb, 
NUM_FRAME_CONTEXTS_LOG2); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only) - vp9_setup_past_independence(cm, xd); + vp9_setup_past_independence(cm); - setup_loopfilter(&xd->lf, rb); + setup_loopfilter(&cm->lf, rb); setup_quantization(pbi, rb); - setup_segmentation(&xd->seg, rb); + setup_segmentation(&cm->seg, rb); setup_tile_info(cm, rb); @@ -937,17 +937,17 @@ void vp9_init_dequantizer(VP9_COMMON *cm) { int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; - VP9_COMMON *const pc = &pbi->common; + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; const uint8_t *data = pbi->source; const uint8_t *data_end = pbi->source + pbi->source_sz; struct vp9_read_bit_buffer rb = { data, data_end, 0, - pc, error_handler }; + cm, error_handler }; const size_t first_partition_size = read_uncompressed_header(pbi, &rb); - const int keyframe = pc->frame_type == KEY_FRAME; - YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx]; + const int keyframe = cm->frame_type == KEY_FRAME; + YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx]; if (!first_partition_size) { // showing a frame directly @@ -958,51 +958,39 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { xd->corrupted = 0; new_fb->corrupted = 0; pbi->do_loopfilter_inline = - (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pbi->mb.lf.filter_level; + (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; if (!read_is_valid(data, first_partition_size, data_end)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - xd->mode_info_context = pc->mi; - xd->prev_mode_info_context = pc->prev_mi; - xd->mode_info_stride = pc->mode_info_stride; - - init_dequantizer(pc, &pbi->mb); + setup_plane_dequants(cm, &pbi->mb, cm->base_qindex); - if (!keyframe) - vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc); + xd->mi_8x8 = cm->mi_grid_visible; + xd->mic_stream_ptr = cm->mi; + xd->mode_info_stride = cm->mode_info_stride; - pc->fc = pc->frame_contexts[pc->frame_context_idx]; + cm->fc = cm->frame_contexts[cm->frame_context_idx]; - vp9_zero(pc->counts); - - // Initialize xd pointers. Any reference should do for xd->pre, so use 0. 
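The vp9_rb_read_* helpers used throughout the header parsing above operate on a plain MSB-first bitstring (the uncompressed header), unlike the arithmetically coded vp9_reader used for tile data. A toy model of their semantics, assuming that bit order; rb_read_signed_literal mirrors the magnitude-then-sign layout that the rewritten read_delta_q() relies on:

#include <stdint.h>
#include <stddef.h>

struct bit_buffer {
  const uint8_t *data;
  size_t bit_offset;
};

/* Pull one bit, most-significant bit of each byte first. */
static int rb_read_bit(struct bit_buffer *rb) {
  const size_t off = rb->bit_offset++;
  return (rb->data[off >> 3] >> (7 - (off & 7))) & 1;
}

/* An n-bit unsigned literal is just n consecutive bits, MSB first. */
static int rb_read_literal(struct bit_buffer *rb, int bits) {
  int value = 0, i;
  for (i = 0; i < bits; ++i)
    value = (value << 1) | rb_read_bit(rb);
  return value;
}

/* Signed literal: magnitude first, then a trailing sign bit. */
static int rb_read_signed_literal(struct bit_buffer *rb, int bits) {
  const int value = rb_read_literal(rb, bits);
  return rb_read_bit(rb) ? -value : value;
}

int main(void) {
  const uint8_t data[] = { 0x58 };    /* bit pattern: 0101 1000 */
  struct bit_buffer rb = { data, 0 };
  return rb_read_signed_literal(&rb, 4) == -5 ? 0 : 1;  /* 0101, sign 1 */
}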
- setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0, NULL); - setup_dst_planes(xd, new_fb, 0, 0); + vp9_zero(cm->counts); new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); - // Create the segmentation map structure and set to 0 - if (!pc->last_frame_seg_map) - CHECK_MEM_ERROR(pc, pc->last_frame_seg_map, - vpx_calloc((pc->mi_rows * pc->mi_cols), 1)); - - setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y); + setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); // clear out the coeff buffer for (i = 0; i < MAX_MB_PLANE; ++i) vp9_zero(xd->plane[i].qcoeff); - set_prev_mi(pc); + set_prev_mi(cm); *p_data_end = decode_tiles(pbi, data + first_partition_size); - pc->last_width = pc->width; - pc->last_height = pc->height; + cm->last_width = cm->width; + cm->last_height = cm->height; new_fb->corrupted |= xd->corrupted; @@ -1010,21 +998,21 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (keyframe && !new_fb->corrupted) pbi->decoded_key_frame = 1; else - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "A stream must start with a complete key frame"); } - if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(pc); + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(cm); - if (!keyframe && !pc->intra_only) { - vp9_adapt_mode_probs(pc); - vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv); + if (!keyframe && !cm->intra_only) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv); } } - if (pc->refresh_frame_context) - pc->frame_contexts[pc->frame_context_idx] = pc->fc; + if (cm->refresh_frame_context) + cm->frame_contexts[cm->frame_context_idx] = cm->fc; return 0; } diff --git a/libvpx/vp9/decoder/vp9_decodframe.h b/libvpx/vp9/decoder/vp9_decodframe.h index 00b6d67..c665f6f 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.h +++ b/libvpx/vp9/decoder/vp9_decodframe.h @@ -15,7 +15,7 @@ struct VP9Common; struct VP9Decompressor; -void vp9_init_dequantizer(struct VP9Common *pc); +void vp9_init_dequantizer(struct VP9Common *cm); int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end); #endif // VP9_DECODER_VP9_DECODFRAME_H_ diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index 0021643..cd74a0b 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -94,9 +94,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; - ENTROPY_CONTEXT above_ec, left_ec; - const int ref = is_inter_block(&xd->mode_info_context->mbmi); - int band, pt, c = 0; + const int ref = is_inter_block(&xd->this_mi->mbmi); + int band, c = 0; vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] = fc->coef_probs[tx_size][type][ref]; vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; @@ -104,41 +103,10 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_prob *prob; vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; const int16_t *scan, *nb; + const uint8_t *band_translate; uint8_t token_cache[1024]; - const uint8_t * band_translate; - - switch (tx_size) { - default: - case TX_4X4: { - scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); - above_ec = A[0] != 0; - left_ec = L[0] != 0; - band_translate = 
vp9_coefband_trans_4x4; - break; - } - case TX_8X8: { - scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - above_ec = !!*(uint16_t *)A; - left_ec = !!*(uint16_t *)L; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - case TX_16X16: { - scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - above_ec = !!*(uint32_t *)A; - left_ec = !!*(uint32_t *)L; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - case TX_32X32: - scan = vp9_default_scan_32x32; - above_ec = !!*(uint64_t *)A; - left_ec = !!*(uint64_t *)L; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - - pt = combine_entropy_contexts(above_ec, left_ec); + int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L, + &scan, &band_translate); nb = vp9_get_coef_neighbors_handle(scan); while (1) { @@ -242,54 +210,38 @@ SKIP_START: return c; } -static int get_eob(struct segmentation *seg, int segment_id, int eob_max) { - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; -} - struct decode_block_args { VP9D_COMP *pbi; vp9_reader *r; int *eobtotal; }; -static void decode_block(int plane, int block, - BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, - void *argv) { +static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *argv) { const struct decode_block_args* const arg = argv; - const int bw = b_width_log2(bsize); // find the maximum eob for this transform size, adjusted by segment MACROBLOCKD *xd = &arg->pbi->mb; + struct segmentation *seg = &arg->pbi->common.seg; struct macroblockd_plane* pd = &xd->plane[plane]; - const int segment_id = xd->mode_info_context->mbmi.segment_id; - const TX_SIZE ss_tx_size = ss_txfrm_size / 2; - const int seg_eob = get_eob(&xd->seg, segment_id, 16 << ss_txfrm_size); - const int off = block >> ss_txfrm_size; - const int mod = bw - ss_tx_size - pd->subsampling_x; - const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size; - const int loff = (off >> mod) << ss_tx_size; - const int tx_size_in_blocks = 1 << ss_tx_size; - ENTROPY_CONTEXT *A = pd->above_context + aoff; - ENTROPY_CONTEXT *L = pd->left_context + loff; - const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, - pd->plane_type, seg_eob, - BLOCK_OFFSET(pd->qcoeff, block, 16), - ss_tx_size, pd->dequant, A, L); + const int segment_id = xd->this_mi->mbmi.segment_id; + const int seg_eob = get_tx_eob(seg, segment_id, tx_size); + int aoff, loff, eob; + + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); + + eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, + pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block), + tx_size, pd->dequant, + pd->above_context + aoff, pd->left_context + loff); + + set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff, - A, L); - } else { - int pt; - for (pt = 0; pt < tx_size_in_blocks; pt++) - A[pt] = L[pt] = eob > 0; - } pd->eobs[block] = eob; *arg->eobtotal += eob; } -int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize) { +int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) { int eobtotal = 0; struct decode_block_args args = {pbi, r, &eobtotal}; foreach_transformed_block(&pbi->mb, bsize, decode_block, &args); diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h index f98fe8d..cf07c56 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libvpx/vp9/decoder/vp9_detokenize.h @@ 
-15,6 +15,6 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" -int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize); +int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index 5a01dd7..17d5def 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -13,7 +13,7 @@ #include <stdio.h> #include "vp9/common/vp9_onyxc_int.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/decoder/vp9_onyxd.h" @@ -110,35 +110,36 @@ void vp9_initialize_dec() { VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP)); + VP9_COMMON *const cm = pbi ? &pbi->common : NULL; - if (!pbi) + if (!cm) return NULL; vp9_zero(*pbi); - if (setjmp(pbi->common.error.jmp)) { - pbi->common.error.setjmp = 0; + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; vp9_remove_decompressor(pbi); return NULL; } - pbi->common.error.setjmp = 1; + cm->error.setjmp = 1; vp9_initialize_dec(); - vp9_create_common(&pbi->common); + vp9_create_common(cm); pbi->oxcf = *oxcf; - pbi->common.current_video_frame = 0; pbi->ready_for_new_data = 1; + cm->current_video_frame = 0; // vp9_init_dequantizer() is first called here. Add check in // frame_init_dequantizer() to avoid unnecessary calling of // vp9_init_dequantizer() for every frame. - vp9_init_dequantizer(&pbi->common); + vp9_init_dequantizer(cm); - vp9_loop_filter_init(&pbi->common, &pbi->mb.lf); + vp9_loop_filter_init(cm); - pbi->common.error.setjmp = 0; + cm->error.setjmp = 0; pbi->decoded_key_frame = 0; if (pbi->oxcf.max_threads > 1) { @@ -160,9 +161,6 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { if (!pbi) return; - if (pbi->common.last_frame_seg_map) - vpx_free(pbi->common.last_frame_seg_map); - vp9_remove_common(&pbi->common); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); @@ -187,21 +185,21 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, * later commit that adds VP9-specific controls for this functionality. */ if (ref_frame_flag == VP9_LAST_FLAG) { - ref_fb_idx = pbi->common.ref_frame_map[0]; + ref_fb_idx = cm->ref_frame_map[0]; } else { - vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); - return pbi->common.error.error_code; + return cm->error.error_code; } if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) { - vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); } else { vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); } - return pbi->common.error.error_code; + return cm->error.error_code; } @@ -261,22 +259,21 @@ int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) { /* If any buffer updating is signaled it should be done here. 
*/ static void swap_frame_buffers(VP9D_COMP *pbi) { int ref_index = 0, mask; + VP9_COMMON *const cm = &pbi->common; for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - if (mask & 1) { - ref_cnt_fb(pbi->common.fb_idx_ref_cnt, - &pbi->common.ref_frame_map[ref_index], - pbi->common.new_fb_idx); - } + if (mask & 1) + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->ref_frame_map[ref_index], + cm->new_fb_idx); ++ref_index; } - pbi->common.frame_to_show = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; - pbi->common.fb_idx_ref_cnt[pbi->common.new_fb_idx]--; + cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - /* Invalidate these references until the next frame starts. */ + // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) - pbi->common.active_ref_idx[ref_index] = INT_MAX; + cm->active_ref_idx[ref_index] = INT_MAX; } int vp9_receive_compressed_data(VP9D_PTR ptr, @@ -293,7 +290,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, if (ptr == 0) return -1; - pbi->common.error.error_code = VPX_CODEC_OK; + cm->error.error_code = VPX_CODEC_OK; pbi->source = source; pbi->source_sz = size; @@ -314,8 +311,8 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->new_fb_idx = get_free_fb(cm); - if (setjmp(pbi->common.error.jmp)) { - pbi->common.error.setjmp = 0; + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; /* We do not know if the missing frame(s) was supposed to update * any of the reference buffers, but we act conservatively and @@ -334,13 +331,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, return -1; } - pbi->common.error.setjmp = 1; + cm->error.setjmp = 1; retcode = vp9_decode_frame(pbi, psource); if (retcode < 0) { - pbi->common.error.error_code = VPX_CODEC_ERROR; - pbi->common.error.setjmp = 0; + cm->error.error_code = VPX_CODEC_ERROR; + cm->error.setjmp = 0; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) cm->fb_idx_ref_cnt[cm->new_fb_idx]--; return retcode; @@ -360,7 +357,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, if (!pbi->do_loopfilter_inline) { /* Apply the loop filter if appropriate.
*/ - vp9_loop_filter_frame(cm, &pbi->mb, pbi->mb.lf.filter_level, 0); + vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0); } #if WRITE_RECON_BUFFER == 2 @@ -389,12 +386,17 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, if (cm->show_frame) { // current mip will be the prev_mip for the next frame MODE_INFO *temp = cm->prev_mip; + MODE_INFO **temp2 = cm->prev_mi_grid_base; cm->prev_mip = cm->mip; cm->mip = temp; + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp2; // update the upper left visible macroblock ptrs cm->mi = cm->mip + cm->mode_info_stride + 1; cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; cm->current_video_frame++; } @@ -403,7 +405,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, pbi->last_time_stamp = time_stamp; pbi->source_sz = 0; - pbi->common.error.setjmp = 0; + cm->error.setjmp = 0; return retcode; } @@ -424,15 +426,17 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; -#if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&pbi->common, &pbi->mb.lf, sd, flags); +#if CONFIG_VP9_POSTPROC + ret = vp9_post_proc_frame(&pbi->common, sd, flags); #else if (pbi->common.frame_to_show) { *sd = *pbi->common.frame_to_show; sd->y_width = pbi->common.width; sd->y_height = pbi->common.height; - sd->uv_height = pbi->common.height / 2; + sd->uv_width = sd->y_width >> pbi->common.subsampling_x; + sd->uv_height = sd->y_height >> pbi->common.subsampling_y; + ret = 0; } else { ret = -1; diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 98ef420..957cfd2 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -41,9 +41,9 @@ unsigned __int64 Sectionbits[500]; #endif #ifdef ENTROPY_STATS -int intra_mode_stats[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES]; +int intra_mode_stats[INTRA_MODES] + [INTRA_MODES] + [INTRA_MODES]; vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES]; extern unsigned int active_section; @@ -54,8 +54,8 @@ extern unsigned int active_section; int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; -int64_t switchable_interp_stats[VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS]; +int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1] + [SWITCHABLE_FILTERS]; void init_tx_count_stats() { vp9_zero(tx_count_32x32p_stats); @@ -88,8 +88,8 @@ static void update_tx_count_stats(VP9_COMMON *cm) { static void update_switchable_interp_stats(VP9_COMMON *cm) { int i, j; - for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; ++i) - for (j = 0; j < VP9_SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTERS+1; ++i) + for (j = 0; j < SWITCHABLE_FILTERS; ++j) { switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j]; } } @@ -141,11 +141,11 @@ void write_switchable_interp_stats() { fclose(fp); printf( - "vp9_default_switchable_filter_count[VP9_SWITCHABLE_FILTERS+1]" - "[VP9_SWITCHABLE_FILTERS] = {\n"); - for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; i++) { + "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]" + "[SWITCHABLE_FILTERS] = {\n"); + for (i = 0; i < SWITCHABLE_FILTERS+1; i++) { printf(" { "); - for (j = 0; j < VP9_SWITCHABLE_FILTERS; j++) { + for (j = 0; j < SWITCHABLE_FILTERS; j++) { 
printf("%"PRId64", ", switchable_interp_stats[i][j]); } printf("},\n"); @@ -181,7 +181,7 @@ static void update_mode( n--; for (i = 0; i < n; ++i) { - vp9_cond_prob_diff_update(w, &Pcur[i], VP9_MODE_UPDATE_PROB, bct[i]); + vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]); } } @@ -189,19 +189,20 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, vp9_writer* const bc) { VP9_COMMON *const cm = &cpi->common; int j; - vp9_prob pnew[VP9_INTRA_MODES - 1]; - unsigned int bct[VP9_INTRA_MODES - 1][2]; + vp9_prob pnew[INTRA_MODES - 1]; + unsigned int bct[INTRA_MODES - 1][2]; for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_tree, pnew, + update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew, cm->fc.y_mode_prob[j], bct, (unsigned int *)cpi->y_mode_count[j]); } -static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size, - BLOCK_SIZE_TYPE bsize, vp9_writer *w) { +static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, + TX_SIZE tx_size, BLOCK_SIZE bsize, + vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -213,10 +214,10 @@ static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size, static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip_coeff = m->mbmi.mb_skip_coeff; + const int skip_coeff = m->mbmi.skip_coeff; vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd)); return skip_coeff; } @@ -228,7 +229,7 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) { for (k = 0; k < MBSKIP_CONTEXTS; ++k) vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], - VP9_MODE_UPDATE_PROB, cm->counts.mbskip[k]); + MODE_UPDATE_PROB, cm->counts.mbskip[k]); } static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { @@ -237,43 +238,43 @@ static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { static void update_switchable_interp_probs(VP9_COMP *const cpi, vp9_writer* const bc) { - VP9_COMMON *const pc = &cpi->common; - unsigned int branch_ct[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1][2]; - vp9_prob new_prob[VP9_SWITCHABLE_FILTERS + 1][VP9_SWITCHABLE_FILTERS - 1]; + VP9_COMMON *const cm = &cpi->common; + unsigned int branch_ct[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS - 1][2]; + vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1]; int i, j; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { + for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { vp9_tree_probs_from_distribution( vp9_switchable_interp_tree, new_prob[j], branch_ct[j], - pc->counts.switchable_interp[j], 0); + cm->counts.switchable_interp[j], 0); } - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i], - VP9_MODE_UPDATE_PROB, branch_ct[j][i]); + for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) { + vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i], 
+ MODE_UPDATE_PROB, branch_ct[j][i]); } } #ifdef MODE_STATS if (!cpi->dummy_packing) - update_switchable_interp_stats(pc); + update_switchable_interp_stats(cm); #endif } -static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) { +static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - unsigned int branch_ct[VP9_INTER_MODES - 1][2]; - vp9_prob new_prob[VP9_INTER_MODES - 1]; + unsigned int branch_ct[INTER_MODES - 1][2]; + vp9_prob new_prob[INTER_MODES - 1]; vp9_tree_probs_from_distribution(vp9_inter_mode_tree, new_prob, branch_ct, - pc->counts.inter_mode[i], NEARESTMV); + cm->counts.inter_mode[i], NEARESTMV); - for (j = 0; j < VP9_INTER_MODES - 1; ++j) - vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j], - VP9_MODE_UPDATE_PROB, branch_ct[j]); + for (j = 0; j < INTER_MODES - 1; ++j) + vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j], + MODE_UPDATE_PROB, branch_ct[j]); } } @@ -356,39 +357,39 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg, // This function encodes the reference frame static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { - VP9_COMMON *const pc = &cpi->common; + VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mi = &xd->this_mi->mbmi; const int segment_id = mi->segment_id; - int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, + int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); // If segment level coding of this signal is disabled... // or the segment allows multiple reference frame options if (!seg_ref_active) { // does the feature use compound prediction or not // (if not specified at the frame/segment level) - if (pc->comp_pred_mode == HYBRID_PREDICTION) { + if (cm->comp_pred_mode == HYBRID_PREDICTION) { vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME, - vp9_get_pred_prob_comp_inter_inter(pc, xd)); + vp9_get_pred_prob_comp_inter_inter(cm, xd)); } else { assert((mi->ref_frame[1] <= INTRA_FRAME) == - (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY)); + (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY)); } if (mi->ref_frame[1] > INTRA_FRAME) { vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME, - vp9_get_pred_prob_comp_ref_p(pc, xd)); + vp9_get_pred_prob_comp_ref_p(cm, xd)); } else { vp9_write(bc, mi->ref_frame[0] != LAST_FRAME, - vp9_get_pred_prob_single_ref_p1(pc, xd)); + vp9_get_pred_prob_single_ref_p1(cm, xd)); if (mi->ref_frame[0] != LAST_FRAME) vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME, - vp9_get_pred_prob_single_ref_p2(pc, xd)); + vp9_get_pred_prob_single_ref_p2(cm, xd)); } } else { assert(mi->ref_frame[1] <= INTRA_FRAME); - assert(vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) == + assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) == mi->ref_frame[0]); } @@ -397,20 +398,20 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { } static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { - VP9_COMMON *const pc = &cpi->common; - const nmv_context *nmvc = &pc->fc.nmvc; + VP9_COMMON *const cm = &cpi->common; + const nmv_context *nmvc = &cm->fc.nmvc; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - struct segmentation *seg = &xd->seg; + struct segmentation *seg = &cm->seg; MB_MODE_INFO *const mi = &m->mbmi; const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; const MB_PREDICTION_MODE mode = 
mi->mode; const int segment_id = mi->segment_id; int skip_coeff; - const BLOCK_SIZE_TYPE bsize = mi->sb_type; + const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = xd->allow_high_precision_mv; - x->partition_info = x->pi + (m - pc->mi); + x->partition_info = x->pi + (m - cm->mi); #ifdef ENTROPY_STATS active_section = 9; @@ -419,7 +420,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (seg->update_map) { if (seg->temporal_update) { const int pred_flag = mi->seg_id_predicted; - vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); vp9_write(bc, pred_flag, pred_prob); if (!pred_flag) write_segment_id(bc, seg, segment_id); @@ -432,12 +433,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(bc, rf != INTRA_FRAME, - vp9_get_pred_prob_intra_inter(pc, xd)); + vp9_get_pred_prob_intra_inter(cm, xd)); - if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT && + if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && !(rf != INTRA_FRAME && (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_tx_size(cpi, mi->txfm_size, bsize, bc); + write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc); } if (rf == INTRA_FRAME) { @@ -445,8 +446,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { active_section = 6; #endif - if (bsize >= BLOCK_SIZE_SB8X8) { - write_intra_mode(bc, mode, pc->fc.y_mode_prob[size_group_lookup[bsize]]); + if (bsize >= BLOCK_8X8) { + write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]); } else { int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; @@ -454,15 +455,15 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; - write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]); + write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]); } } } - write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); + write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]); } else { vp9_prob *mv_ref_p; encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]]; + mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]]; #ifdef ENTROPY_STATS active_section = 3; @@ -470,23 +471,23 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { // If segment skip is not enabled code the mode. 
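As an aside on encode_ref_frame() above: with HYBRID_PREDICTION enabled, the reference choice boils down to at most three binary decisions, each of which the real coder writes against a context-derived probability. A flattened sketch of that decision tree, assuming the reference ordering INTRA_FRAME < LAST_FRAME < GOLDEN_FRAME < ALTREF_FRAME implied by the code above; put_bit() is a hypothetical stand-in for vp9_write() with the probabilities omitted:

#include <stdio.h>

typedef enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME } ref_t;

static void put_bit(int b) { printf("%d", b); }  /* stand-in for vp9_write() */

/* Mirrors encode_ref_frame(): compound-vs-single first, then which
 * reference(s) are in use. */
static void signal_refs(ref_t ref0, ref_t ref1) {
  const int is_compound = ref1 > INTRA_FRAME;
  put_bit(is_compound);
  if (is_compound) {
    put_bit(ref0 == GOLDEN_FRAME);    /* comp-ref decision */
  } else {
    put_bit(ref0 != LAST_FRAME);      /* single-ref decision 1 */
    if (ref0 != LAST_FRAME)
      put_bit(ref0 != GOLDEN_FRAME);  /* single-ref decision 2 */
  }
}

int main(void) {
  signal_refs(LAST_FRAME, INTRA_FRAME);    /* prints "00"  */
  printf("\n");
  signal_refs(ALTREF_FRAME, INTRA_FRAME);  /* prints "011" */
  printf("\n");
  return 0;
}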
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); - ++pc->counts.inter_mode[mi->mb_mode_context[rf]] + ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(mode)]; } } - if (cpi->common.mcomp_filter_type == SWITCHABLE) { + if (cm->mcomp_filter_type == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); write_token(bc, vp9_switchable_interp_tree, - vp9_get_pred_probs_switchable_interp(&cpi->common, xd), - vp9_switchable_interp_encodings + - vp9_switchable_interp_map[mi->interp_filter]); + cm->fc.switchable_interp_prob[ctx], + &vp9_switchable_interp_encodings[mi->interp_filter]); } else { - assert(mi->interp_filter == cpi->common.mcomp_filter_type); + assert(mi->interp_filter == cm->mcomp_filter_type); } - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { int j; MB_PREDICTION_MODE blockmode; int_mv blockmv; @@ -499,7 +500,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { blockmode = x->partition_info->bmi[j].mode; blockmv = m->bmi[j].as_mv[0]; write_sb_mv_ref(bc, blockmode, mv_ref_p); - ++pc->counts.inter_mode[mi->mb_mode_context[rf]] + ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(blockmode)]; if (blockmode == NEWMV) { @@ -531,26 +532,29 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } } -static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, +static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc) { - const VP9_COMMON *const c = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const struct segmentation *const seg = &cm->seg; + MODE_INFO *m = mi_8x8[0]; const int ym = m->mbmi.mode; - const int mis = c->mode_info_stride; const int segment_id = m->mbmi.segment_id; + MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride]; + MODE_INFO *left_mi = mi_8x8[-1]; - if (xd->seg.update_map) - write_segment_id(bc, &xd->seg, m->mbmi.segment_id); + if (seg->update_map) + write_segment_id(bc, seg, m->mbmi.segment_id); write_skip_coeff(cpi, segment_id, m, bc); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT) - write_selected_tx_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc); + if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) + write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); + if (m->mbmi.sb_type >= BLOCK_8X8) { + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(m, 0) : DC_PRED; + left_block_mode(m, left_mi, 0) : DC_PRED; write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; @@ -558,10 +562,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - const int i = idy * 2 + idx; - const MB_PREDICTION_MODE A = above_block_mode(m, i, mis); + int i = idy * 2 + idx; + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(m, i) : DC_PRED; + left_block_mode(m, left_mi, i) : DC_PRED; const int bm = m->bmi[i].as_mode; #ifdef ENTROPY_STATS ++intra_mode_stats[A][L][bm]; @@ -574,21 +578,25 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } -static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, +static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; + MODE_INFO *m = mi_8x8[0]; - if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) + if (m->mbmi.sb_type < BLOCK_8X8) if (xd->ab_index > 0) return; - xd->mode_info_context = m; - set_mi_row_col(&cpi->common, xd, mi_row, - 1 << mi_height_log2(m->mbmi.sb_type), - mi_col, 1 << mi_width_log2(m->mbmi.sb_type)); + + xd->this_mi = mi_8x8[0]; + xd->mi_8x8 = mi_8x8; + + set_mi_row_col(&cpi->common, xd, + mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], + mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]); if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { - write_mb_modes_kf(cpi, m, bc); + write_mb_modes_kf(cpi, mi_8x8, bc); #ifdef ENTROPY_STATS active_section = 8; #endif @@ -603,10 +611,9 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, pack_mb_tokens(bc, tok, tok_end); } -static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, +static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; @@ -614,20 +621,22 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, int bs = (1 << bsl) / 4; // mode_info step for subsize int n; PARTITION_TYPE partition = PARTITION_NONE; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; + MODE_INFO *m = mi_8x8[0]; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = partition_lookup[bsl][m->mbmi.sb_type]; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_8X8) if (xd->ab_index > 0) return; - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { int pl; - const int idx = check_bsize_coverage(cm, mi_row, mi_col, bsize); + const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols, + mi_row, mi_col); set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); // encode the partition information @@ -645,25 +654,26 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); break; case PARTITION_HORZ: - write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); *(get_sb_index(xd, subsize)) = 1; if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, m + bs * mis, bc, tok, tok_end, mi_row + bs, mi_col); + write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs, + mi_col); break; case PARTITION_VERT: - write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); *(get_sb_index(xd, subsize)) = 1; if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, m + bs, bc, tok, tok_end, mi_row, mi_col + bs); 
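write_modes_sb() here, like its decoder twin decode_modes_sb() earlier in the patch, walks the superblock quadtree: PARTITION_NONE emits one block, HORZ and VERT emit up to two halves guarded by a frame-edge check, and SPLIT recurses into four quadrants. A self-contained sketch of that traversal over one 64x64 superblock (8x8 mode-info units), with a toy partition rule in place of the coded decision:

#include <stdio.h>

typedef enum { PART_NONE, PART_HORZ, PART_VERT, PART_SPLIT } part_t;

/* Toy rule; the real coders read/write this decision against
 * context-dependent probabilities (vp9_partition_tree). */
static part_t toy_partition(int bs) {
  return bs == 8 ? PART_SPLIT : bs == 4 ? PART_HORZ : PART_NONE;
}

static void walk(int row, int col, int bs, int rows, int cols) {
  const int h = bs / 2;  /* sub-block step, as with hbs/bs above */
  if (row >= rows || col >= cols)
    return;              /* clip blocks that start past the frame edge */
  switch (toy_partition(bs)) {
    case PART_NONE:
      printf("%dx%d block at (%d,%d)\n", bs, bs, row, col);
      break;
    case PART_HORZ:
      printf("%dx%d block at (%d,%d)\n", bs, h, row, col);
      if (row + h < rows)
        printf("%dx%d block at (%d,%d)\n", bs, h, row + h, col);
      break;
    case PART_VERT:
      printf("%dx%d block at (%d,%d)\n", h, bs, row, col);
      if (col + h < cols)
        printf("%dx%d block at (%d,%d)\n", h, bs, row, col + h);
      break;
    case PART_SPLIT: {
      int n;
      for (n = 0; n < 4; ++n)  /* quadrants in raster order */
        walk(row + (n >> 1) * h, col + (n & 1) * h, h, rows, cols);
      break;
    }
  }
}

int main(void) {
  walk(0, 0, 8, 8, 8);  /* one full superblock, no edge clipping */
  return 0;
}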
+ write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs); break; case PARTITION_SPLIT: for (n = 0; n < 4; n++) { int j = n >> 1, i = n & 0x01; *(get_sb_index(xd, subsize)) = n; - write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end, + write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end, mi_row + j * bs, mi_col + i * bs, subsize); } break; @@ -672,8 +682,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, } // update partition context - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, subsize, bsize); } @@ -681,20 +691,23 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { - VP9_COMMON *const c = &cpi->common; - const int mis = c->mode_info_stride; - MODE_INFO *m, *m_ptr = c->mi; + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mode_info_stride; int mi_row, mi_col; - - m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis; - - for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end; - mi_row += 8, m_ptr += 8 * mis) { - m = m_ptr; - vp9_zero(c->left_seg_context); - for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end; - mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE) - write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64); + MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **m_8x8; + + mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis; + + for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + mi_row += 8, mi_8x8 += 8 * mis) { + m_8x8 = mi_8x8; + vp9_zero(cm->left_seg_context); + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { + write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col, + BLOCK_64X64); + } } } @@ -781,94 +794,170 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, vp9_coeff_probs_model *old_frame_coef_probs = cpi->common.fc.coef_probs[tx_size]; vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size]; - int i, j, k, l, t; - int update[2] = {0, 0}; - int savings; - + const vp9_prob upd = VP9_COEF_UPDATE_PROB; const int entropy_nodes_update = UNCONSTRAINED_NODES; + int i, j, k, l, t; + switch (cpi->sf.use_fast_coef_updates) { + case 0: { + /* dry run to see if there is any udpate at all needed */ + int savings = 0; + int update[2] = {0, 0}; + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + for (t = 0; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; + int s; + int u = 0; + + if (l >= 3 && k == 0) + continue; + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); + if (s > 0 && newp != oldp) + u = 1; + if (u) + savings += s - (int)(vp9_cost_zero(upd)); + else + savings -= (int)(vp9_cost_zero(upd)); + update[u]++; + } + } + } + } + } - 
const int tstart = 0; - /* dry run to see if there is any udpate at all needed */ - savings = 0; - for (i = 0; i < BLOCK_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - // int prev_coef_savings[ENTROPY_NODES] = {0}; - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - for (t = tstart; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; - const vp9_prob upd = VP9_COEF_UPDATE_PROB; - int s; - int u = 0; - - if (l >= 3 && k == 0) - continue; - if (t == PIVOT_NODE) - s = vp9_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); - else - s = vp9_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); - if (s > 0 && newp != oldp) - u = 1; - if (u) - savings += s - (int)(vp9_cost_zero(upd)); - else - savings -= (int)(vp9_cost_zero(upd)); - update[u]++; + // printf("Update %d %d, savings %d\n", update[0], update[1], savings); + /* Is coef updated at all */ + if (update[1] == 0 || savings < 0) { + vp9_write_bit(bc, 0); + return; + } + vp9_write_bit(bc, 1); + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + const vp9_prob upd = VP9_COEF_UPDATE_PROB; + int s; + int u = 0; + if (l >= 3 && k == 0) + continue; + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], + *oldp, &newp, upd); + if (s > 0 && newp != *oldp) + u = 1; + vp9_write(bc, u, upd); +#ifdef ENTROPY_STATS + if (!cpi->dummy_packing) + ++tree_update_hist[tx_size][i][j][k][l][t][u]; +#endif + if (u) { + /* send/use new probability */ + vp9_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } } } } + return; } - } - // printf("Update %d %d, savings %d\n", update[0], update[1], savings); - /* Is coef updated at all */ - if (update[1] == 0 || savings < 0) { - vp9_write_bit(bc, 0); - return; - } - vp9_write_bit(bc, 1); - for (i = 0; i < BLOCK_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - // int prev_coef_savings[ENTROPY_NODES] = {0}; - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - // calc probs and branch cts for this frame only - for (t = tstart; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; - const vp9_prob upd = VP9_COEF_UPDATE_PROB; - int s; - int u = 0; - if (l >= 3 && k == 0) - continue; - if (t == PIVOT_NODE) - s = vp9_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); - else - s = vp9_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], - *oldp, &newp, upd); - if (s > 0 && newp != *oldp) - u = 1; - vp9_write(bc, u, upd); + case 1: + case 2: { + const int prev_coef_contexts_to_update = + (cpi->sf.use_fast_coef_updates == 2 ? + PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS); + const int coef_band_to_update = + (cpi->sf.use_fast_coef_updates == 2 ? 
+ COEF_BANDS >> 1 : COEF_BANDS); + int updates = 0; + int noupdates_before_first = 0; + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + int s; + int u = 0; + if (l >= 3 && k == 0) + continue; + if (l >= prev_coef_contexts_to_update || + k >= coef_band_to_update) { + u = 0; + } else { + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], + *oldp, &newp, upd); + if (s > 0 && newp != *oldp) + u = 1; + } + updates += u; + if (u == 0 && updates == 0) { + noupdates_before_first++; #ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; + if (!cpi->dummy_packing) + ++tree_update_hist[tx_size][i][j][k][l][t][u]; #endif - if (u) { - /* send/use new probability */ - vp9_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; + continue; + } + if (u == 1 && updates == 1) { + int v; + // first update + vp9_write_bit(bc, 1); + for (v = 0; v < noupdates_before_first; ++v) + vp9_write(bc, 0, upd); + } + vp9_write(bc, u, upd); +#ifdef ENTROPY_STATS + if (!cpi->dummy_packing) + ++tree_update_hist[tx_size][i][j][k][l][t][u]; +#endif + if (u) { + /* send/use new probability */ + vp9_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } } } } } + if (updates == 0) { + vp9_write_bit(bc, 0); // no updates + } + return; } + + default: + assert(0); } } @@ -967,7 +1056,7 @@ static void encode_segmentation(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { int i, j; - struct segmentation *seg = &cpi->mb.e_mbd.seg; + struct segmentation *seg = &cpi->common.seg; vp9_wb_write_bit(wb, seg->enabled); if (!seg->enabled) @@ -1047,7 +1136,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { ct_8x8p); for (j = 0; j < TX_SIZES - 3; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], - VP9_MODE_UPDATE_PROB, ct_8x8p[j]); + MODE_UPDATE_PROB, ct_8x8p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { @@ -1055,14 +1144,14 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { ct_16x16p); for (j = 0; j < TX_SIZES - 2; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], - VP9_MODE_UPDATE_PROB, ct_16x16p[j]); + MODE_UPDATE_PROB, ct_16x16p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); for (j = 0; j < TX_SIZES - 1; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], - VP9_MODE_UPDATE_PROB, ct_32x32p[j]); + MODE_UPDATE_PROB, ct_32x32p[j]); } #ifdef MODE_STATS if (!cpi->dummy_packing) @@ -1073,9 +1162,11 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type, struct vp9_write_bit_buffer *wb) { + const int type_to_literal[] = { 1, 0, 2 }; + vp9_wb_write_bit(wb, type == SWITCHABLE); if (type != SWITCHABLE) - vp9_wb_write_literal(wb, type, 2); + vp9_wb_write_literal(wb, type_to_literal[type], 2); } static void fix_mcomp_filter_type(VP9_COMP *cpi) { @@ -1083,19 +1174,19 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { if (cm->mcomp_filter_type == SWITCHABLE) { // Check to see if only 
one of the filters is actually used - int count[VP9_SWITCHABLE_FILTERS]; + int count[SWITCHABLE_FILTERS]; int i, j, c = 0; - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { count[i] = 0; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) + for (j = 0; j <= SWITCHABLE_FILTERS; ++j) count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } if (c == 1) { // Only one filter is used. So set the filter at frame level - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { if (count[i]) { - cm->mcomp_filter_type = vp9_switchable_interp[i]; + cm->mcomp_filter_type = i; break; } } @@ -1127,7 +1218,8 @@ static int get_refresh_mask(VP9_COMP *cpi) { if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { #else - if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && + !cpi->use_svc) { #endif // Preserve the previously existing golden frame and update the frame in // the alt ref slot instead. This is highly specific to the use of @@ -1239,9 +1331,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]]; found = cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; + + // TODO(ivan): This prevents a bug while more than 3 buffers are used. Do it + // in a better way. + if (cpi->use_svc) { + found = 0; + } vp9_wb_write_bit(wb, found); - if (found) + if (found) { break; + } } if (!found) { @@ -1340,7 +1439,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2); - encode_loopfilter(&xd->lf, wb); + encode_loopfilter(&cm->lf, wb); encode_quantization(cm, wb); encode_segmentation(cpi, wb); @@ -1382,7 +1481,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { for (i = 0; i < INTRA_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->intra_inter_count[i]); if (cm->allow_comp_inter_inter) { @@ -1396,7 +1495,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (use_hybrid_pred) for (i = 0; i < COMP_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->comp_inter_count[i]); } } @@ -1404,10 +1503,10 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { for (i = 0; i < REF_CONTEXTS; i++) { vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->single_ref_count[i][0]); vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->single_ref_count[i][1]); } } @@ -1415,7 +1514,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->comp_ref_count[i]); update_mbintra_mode_probs(cpi, &header_bc); @@ -1453,7 +1552,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { vp9_compute_update_table(); #ifdef ENTROPY_STATS - if (pc->frame_type == INTER_FRAME) + if (cm->frame_type == INTER_FRAME) active_section = 0; else active_section = 7; diff --git 
a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 3e377cf..013047e 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -48,7 +48,11 @@ typedef struct { int comp_pred_diff; int single_pred_diff; int64_t tx_rd_diff[TX_MODES]; - int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + + // motion vector cache for adaptive motion search control in partition + // search loop + int_mv pred_mv[MAX_REF_FRAMES]; // Bit flag for each mode whether it has high error in comparison to others. unsigned int modes_with_high_error; @@ -121,9 +125,9 @@ struct macroblock { int mbmode_cost[MB_MODE_COUNT]; unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV]; int intra_uv_mode_cost[2][MB_MODE_COUNT]; - int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES]; - int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS]; // These define limits to motion vector components to prevent them // from extending outside the UMV borders @@ -144,12 +148,12 @@ struct macroblock { int optimize; // indicate if it is in the rd search loop or encoding process - int rd_search; + int use_lp32x32fdct; int skip_encode; // Used to store sub partition's choices. int fast_ms; - int_mv pred_mv; + int_mv pred_mv[MAX_REF_FRAMES]; int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the @@ -170,10 +174,10 @@ struct macroblock { PICK_MODE_CONTEXT sb64_context; int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; - BLOCK_SIZE_TYPE b_partitioning[4][4][4]; - BLOCK_SIZE_TYPE mb_partitioning[4][4]; - BLOCK_SIZE_TYPE sb_partitioning[4]; - BLOCK_SIZE_TYPE sb64_partitioning; + BLOCK_SIZE b_partitioning[4][4][4]; + BLOCK_SIZE mb_partitioning[4][4]; + BLOCK_SIZE sb_partitioning[4]; + BLOCK_SIZE sb64_partitioning; void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch); diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c index 3112dad..4f4ad04 100644 --- a/libvpx/vp9/encoder/vp9_dct.c +++ b/libvpx/vp9/encoder/vp9_dct.c @@ -1077,6 +1077,44 @@ static void dct32_1d(int *input, int *output, int round) { output[30] = step[30]; output[31] = step[31]; + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. 
+ if (round) { + output[0] = half_round_shift(output[0]); + output[1] = half_round_shift(output[1]); + output[2] = half_round_shift(output[2]); + output[3] = half_round_shift(output[3]); + output[4] = half_round_shift(output[4]); + output[5] = half_round_shift(output[5]); + output[6] = half_round_shift(output[6]); + output[7] = half_round_shift(output[7]); + output[8] = half_round_shift(output[8]); + output[9] = half_round_shift(output[9]); + output[10] = half_round_shift(output[10]); + output[11] = half_round_shift(output[11]); + output[12] = half_round_shift(output[12]); + output[13] = half_round_shift(output[13]); + output[14] = half_round_shift(output[14]); + output[15] = half_round_shift(output[15]); + + output[16] = half_round_shift(output[16]); + output[17] = half_round_shift(output[17]); + output[18] = half_round_shift(output[18]); + output[19] = half_round_shift(output[19]); + output[20] = half_round_shift(output[20]); + output[21] = half_round_shift(output[21]); + output[22] = half_round_shift(output[22]); + output[23] = half_round_shift(output[23]); + output[24] = half_round_shift(output[24]); + output[25] = half_round_shift(output[25]); + output[26] = half_round_shift(output[26]); + output[27] = half_round_shift(output[27]); + output[28] = half_round_shift(output[28]); + output[29] = half_round_shift(output[29]); + output[30] = half_round_shift(output[30]); + output[31] = half_round_shift(output[31]); + } + // Stage 3 step[0] = output[0] + output[(8 - 1)]; step[1] = output[1] + output[(8 - 2)]; @@ -1112,44 +1150,6 @@ static void dct32_1d(int *input, int *output, int round) { step[30] = output[30] + output[25]; step[31] = output[31] + output[24]; - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (round) { - step[0] = half_round_shift(step[0]); - step[1] = half_round_shift(step[1]); - step[2] = half_round_shift(step[2]); - step[3] = half_round_shift(step[3]); - step[4] = half_round_shift(step[4]); - step[5] = half_round_shift(step[5]); - step[6] = half_round_shift(step[6]); - step[7] = half_round_shift(step[7]); - step[8] = half_round_shift(step[8]); - step[9] = half_round_shift(step[9]); - step[10] = half_round_shift(step[10]); - step[11] = half_round_shift(step[11]); - step[12] = half_round_shift(step[12]); - step[13] = half_round_shift(step[13]); - step[14] = half_round_shift(step[14]); - step[15] = half_round_shift(step[15]); - - step[16] = half_round_shift(step[16]); - step[17] = half_round_shift(step[17]); - step[18] = half_round_shift(step[18]); - step[19] = half_round_shift(step[19]); - step[20] = half_round_shift(step[20]); - step[21] = half_round_shift(step[21]); - step[22] = half_round_shift(step[22]); - step[23] = half_round_shift(step[23]); - step[24] = half_round_shift(step[24]); - step[25] = half_round_shift(step[25]); - step[26] = half_round_shift(step[26]); - step[27] = half_round_shift(step[27]); - step[28] = half_round_shift(step[28]); - step[29] = half_round_shift(step[29]); - step[30] = half_round_shift(step[30]); - step[31] = half_round_shift(step[31]); - } - // Stage 4 output[0] = step[0] + step[3]; output[1] = step[1] + step[2]; diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 66eae41..44ab02d 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -8,43 +8,55 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#include "./vpx_config.h" +#include <limits.h> +#include <math.h> +#include <stdio.h> + #include "./vp9_rtcd.h" -#include "vp9/encoder/vp9_encodeframe.h" -#include "vp9/encoder/vp9_encodemb.h" -#include "vp9/encoder/vp9_encodemv.h" +#include "./vpx_config.h" + +#include "vpx_ports/vpx_timer.h" + #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/common/vp9_extend.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" -#include "vp9/common/vp9_quant_common.h" -#include "vp9/encoder/vp9_segmentation.h" -#include "vp9/encoder/vp9_encodeintra.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/common/vp9_extend.h" #include "vp9/common/vp9_findnearmv.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" + +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodeintra.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_tokenize.h" -#include "./vp9_rtcd.h" -#include <stdio.h> -#include <math.h> -#include <limits.h> -#include "vpx_ports/vpx_timer.h" -#include "vp9/common/vp9_pred_common.h" -#include "vp9/common/vp9_mvref_common.h" #define DBG_PRNT_SEGMAP 0 + +static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_8X8, // ONLY_8X8 + TX_16X16, // ONLY_16X16 + TX_32X32, // ONLY_32X32 + TX_32X32, // TX_MODE_SELECT +}; + // #define ENC_DEBUG #ifdef ENC_DEBUG int enc_debug = 0; #endif static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize); + int mi_row, int mi_col, BLOCK_SIZE bsize); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -53,7 +65,10 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); * This also avoids the need for divide by zero checks in * vp9_activity_masking(). */ -#define VP9_ACTIVITY_AVG_MIN (64) +#define ACTIVITY_AVG_MIN (64) + +/* Motion vector component magnitude threshold for defining fast motion. */ +#define FAST_MOTION_MV_THRESH (24) /* This is used as a reference when computing the source variance for the * purposes of activity masking. @@ -71,13 +86,14 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128 }; -static unsigned int get_sb_variance(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bs) { +static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { unsigned int var, sse; var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, VP9_VAR_OFFS, 0, &sse); - return var >> num_pels_log2_lookup[bs]; + return (var + (1 << (num_pels_log2_lookup[bs] - 1))) >> + num_pels_log2_lookup[bs]; } // Original activity measure from Tim T's code. @@ -103,31 +119,29 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) { } // Stub for alternative experimental activity measures. 
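One behavioral fix worth calling out in the hunk above: get_sb_variance becomes get_sby_perpixel_variance and now adds half the divisor before the shift, so the per-pixel variance is rounded to nearest instead of truncated. A minimal standalone sketch of the difference (my own names, not libvpx code):

    #include <stdio.h>

    /* Truncating normalization: plain right shift by log2(pixel count). */
    static unsigned per_pixel_truncated(unsigned var, int log2_pels) {
      return var >> log2_pels;
    }

    /* Round-to-nearest: bias by half the divisor first, as the new code does. */
    static unsigned per_pixel_rounded(unsigned var, int log2_pels) {
      return (var + (1u << (log2_pels - 1))) >> log2_pels;
    }

    int main(void) {
      const int log2_pels = 12;   /* a 64x64 block has 2^12 pixels */
      const unsigned var = 6144;  /* exactly 1.5 per pixel */
      printf("truncated %u, rounded %u\n",
             per_pixel_truncated(var, log2_pels),   /* 1 */
             per_pixel_rounded(var, log2_pels));    /* 2 */
      return 0;
    }

The bias matters because these per-pixel values feed small-integer thresholds in the partition logic, where truncation systematically underestimates.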
-static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, - int use_dc_pred) { - return vp9_encode_intra(cpi, x, use_dc_pred); +static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) { + return vp9_encode_intra(x, use_dc_pred); } DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function #define ALT_ACT_MEASURE 1 -static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, - int mb_row, int mb_col) { +static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { unsigned int mb_activity; if (ALT_ACT_MEASURE) { int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); // Or use and alternative. - mb_activity = alt_activity_measure(cpi, x, use_dc_pred); + mb_activity = alt_activity_measure(x, use_dc_pred); } else { // Original activity measure from Tim T's code. mb_activity = tt_activity_measure(x); } - if (mb_activity < VP9_ACTIVITY_AVG_MIN) - mb_activity = VP9_ACTIVITY_AVG_MIN; + if (mb_activity < ACTIVITY_AVG_MIN) + mb_activity = ACTIVITY_AVG_MIN; return mb_activity; } @@ -175,10 +189,10 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { #else // Simple mean for now cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); -#endif +#endif // ACT_MEDIAN - if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN) - cpi->activity_avg = VP9_ACTIVITY_AVG_MIN; + if (cpi->activity_avg < ACTIVITY_AVG_MIN) + cpi->activity_avg = ACTIVITY_AVG_MIN; // Experimental code: return fixed value normalized for several clips if (ALT_ACT_MEASURE) @@ -240,7 +254,7 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { #endif } -#endif +#endif // USE_ACT_INDEX // Loop through all MBs. 
Note activity of each, average activity and // calculate a normalized activity for each @@ -277,7 +291,7 @@ static void build_activity_map(VP9_COMP *cpi) { #endif // measure activity - mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col); + mb_activity = mb_activity_measure(x, mb_row, mb_col); // Keep frame sum activity_sum += mb_activity; @@ -331,15 +345,17 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { } static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE_TYPE bsize, int output_enabled) { + BLOCK_SIZE bsize, int output_enabled) { int i, x_idx, y; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO * const mbmi = &xd->this_mi->mbmi; + MODE_INFO *mi_addr = xd->this_mi; int mb_mode_index = ctx->best_mode_index; - const int mis = cpi->common.mode_info_stride; + const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -349,17 +365,16 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES); assert(mi->mbmi.sb_type == bsize); + *mi_addr = *mi; + // Restore the coding context of the MB to that that was in place // when the mode was picked for it - for (y = 0; y < mi_height; y++) { - for (x_idx = 0; x_idx < mi_width; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx - && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) { - MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis; - *mi_addr = *mi; - } - } - } + for (y = 0; y < mi_height; y++) + for (x_idx = 0; x_idx < mi_width; x_idx++) + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx + && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) + xd->mi_8x8[x_idx + y * mis] = mi_addr; + // FIXME(rbultje) I'm pretty sure this should go to the end of this block // (i.e. 
after the output_enabled) if (bsize < BLOCK_32X32) { @@ -378,12 +393,12 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, if (!output_enabled) return; - if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < TX_MODES; i++) cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (cpi->common.frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME) { // Restore the coding modes to that held in the coding context // if (mb_mode == I4X4_PRED) // for (i = 0; i < 16; i++) @@ -401,7 +416,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/, THR_D153_PRED /*D153_PRED*/, - THR_D27_PRED /*D27_PRED*/, + THR_D207_PRED /*D207_PRED*/, THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/, THR_B_PRED /*I4X4_PRED*/, @@ -412,7 +427,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, // Note how often each mode chosen as best cpi->mode_chosen_counts[mb_mode_index]++; if (is_inter_block(mbmi) - && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { + && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { int_mv best_mv, best_second_mv; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; @@ -427,29 +442,17 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv); } - if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) { - int i, j; - for (j = 0; j < mi_height; ++j) - for (i = 0; i < mi_width; ++i) - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i - && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j) - xd->mode_info_context[mis * j + i].mbmi = *mbmi; - } - - if (cpi->common.mcomp_filter_type == SWITCHABLE - && is_inter_mode(mbmi->mode)) { - ++cpi->common.counts.switchable_interp[ - vp9_get_pred_context_switchable_interp(xd)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; } cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= SWITCHABLE_FILTERS; i++) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; - } } } @@ -469,10 +472,10 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, } static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCK * const x = &cpi->mb; - VP9_COMMON * const cm = &cpi->common; - MACROBLOCKD * const xd = &x->e_mbd; + BLOCK_SIZE bsize) { + MACROBLOCK *const x = &cpi->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; const int idx_str = xd->mode_info_stride * mi_row + mi_col; @@ -481,18 +484,9 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int mb_row = mi_row >> 1; const int mb_col = mi_col >> 1; const int idx_map = mb_row * cm->mb_cols + mb_col; - const struct segmentation *const seg = &xd->seg; - int i; + const struct segmentation *const seg = &cm->seg; - // entropy context structures - for (i = 0; i < MAX_MB_PLANE; i++) { - 
xd->plane[i].above_context = cm->above_context[i] - + (mi_col * 2 >> xd->plane[i].subsampling_x); - xd->plane[i].left_context = cm->left_context[i] - + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); - } - - // partition contexts + set_skip_context(cm, xd, mi_row, mi_col); set_partition_seg_context(cm, xd, mi_row, mi_col); // Activity map pointer @@ -501,23 +495,28 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, /* pointers to mode info contexts */ x->partition_info = x->pi + idx_str; - xd->mode_info_context = cm->mi + idx_str; - mbmi = &xd->mode_info_context->mbmi; + + xd->mi_8x8 = cm->mi_grid_visible + idx_str; + xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; + // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL; + xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; + + xd->this_mi = + xd->mi_8x8[0] = cm->mi + idx_str; + + mbmi = &xd->this_mi->mbmi; // Set up destination pointers setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); - /* Set up limit values for MV components to prevent them from - * extending beyond the UMV borders assuming 16x16 block size */ - x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE - + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND)); - x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE - + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND)); + // Set up limit values for MV components + // mv beyond the range do not produce new/different prediction block + x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND); + x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND); + x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND; + x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND; // Set up distance of MB to edge of frame in 1/8th pel units assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); @@ -564,25 +563,33 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, int *totalrate, int64_t *totaldist, - BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - x->rd_search = 1; + // Use the lower precision, but faster, 32x32 fdct for mode selection. + x->use_lp32x32fdct = 1; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. 
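The replacement motion-vector limits above are tighter than the old VP9BORDERINPIXELS-based bounds: once a displacement pushes the whole block, plus the interpolation filter's reach, outside the frame, every further displacement yields the same fully extended-border prediction, so searching beyond that point is wasted work. A hedged sketch of the bound computation with simplified constants (MI_SIZE of 8 pixels matches the codec; the filter reach here is an assumption):

    #include <stdio.h>

    #define MI_SIZE 8        /* pixels per mode-info unit */
    #define INTERP_EXTEND 4  /* assumed subpel filter reach in pixels */

    typedef struct { int row_min, row_max, col_min, col_max; } mv_limits;

    static mv_limits mv_search_limits(int mi_row, int mi_col,
                                      int mi_height, int mi_width,
                                      int frame_mi_rows, int frame_mi_cols) {
      mv_limits lim;
      /* Up/left: moving by more than the block's own extent (plus filter
       * reach) puts every referenced pixel above/left of the frame. */
      lim.row_min = -((mi_row + mi_height) * MI_SIZE + INTERP_EXTEND);
      lim.col_min = -((mi_col + mi_width) * MI_SIZE + INTERP_EXTEND);
      /* Down/right: symmetric reasoning against the far frame edges. */
      lim.row_max = (frame_mi_rows - mi_row) * MI_SIZE + INTERP_EXTEND;
      lim.col_max = (frame_mi_cols - mi_col) * MI_SIZE + INTERP_EXTEND;
      return lim;
    }

    int main(void) {
      /* 64x64 block (8x8 MI units) at the origin of a 1088x1920 frame. */
      mv_limits lim = mv_search_limits(0, 0, 8, 8, 136, 240);
      printf("rows [%d, %d]  cols [%d, %d]\n",
             lim.row_min, lim.row_max, lim.col_min, lim.col_max);
      return 0;
    }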
- if (xd->ab_index != 0) + if (xd->ab_index != 0) { + *totalrate = 0; + *totaldist = 0; return; + } } set_offsets(cpi, mi_row, mi_col, bsize); - xd->mode_info_context->mbmi.sb_type = bsize; + xd->this_mi->mbmi.sb_type = bsize; + + // Set to zero to make sure we do not use the previous encoded frame stats + xd->this_mi->mbmi.skip_coeff = 0; + + x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); - x->source_variance = get_sb_variance(cpi, x, bsize); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); @@ -600,38 +607,39 @@ static void update_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = xd->mode_info_context; + MODE_INFO *mi = xd->this_mi; MB_MODE_INFO *const mbmi = &mi->mbmi; if (cm->frame_type != KEY_FRAME) { - const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id, + const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) - cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)][mbmi - ->ref_frame[0] > INTRA_FRAME]++; + cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)] + [is_inter_block(mbmi)]++; // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. - if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) { + if (is_inter_block(mbmi) && !seg_ref_active) { if (cm->comp_pred_mode == HYBRID_PREDICTION) cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)] - [mbmi->ref_frame[1] > INTRA_FRAME]++; + [has_second_ref(mbmi)]++; - if (mbmi->ref_frame[1] > INTRA_FRAME) { - cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)][mbmi - ->ref_frame[0] == GOLDEN_FRAME]++; + if (has_second_ref(mbmi)) { + cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)] + [mbmi->ref_frame[0] == GOLDEN_FRAME]++; } else { - cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)] - [0][mbmi->ref_frame[0] != LAST_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)][0] + [mbmi->ref_frame[0] != LAST_FRAME]++; if (mbmi->ref_frame[0] != LAST_FRAME) cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1] - [mbmi->ref_frame[0] != GOLDEN_FRAME]++; + [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } + // Count of last ref frame 0,0 usage - if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame[0] == LAST_FRAME)) + if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME) cpi->inter_zz_count++; } } @@ -639,9 +647,8 @@ static void update_stats(VP9_COMP *cpi) { // TODO(jingning): the variables used here are little complicated. need further // refactoring on organizing the temporary buffers, when recursive // partition down to 4x4 block size is enabled. 
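The update_stats rewrite above is largely cosmetic (the is_inter_block/has_second_ref helpers replace raw ref_frame comparisons), but the underlying pattern is worth spelling out: every binary coding decision is tallied under the prediction context it was made in, and those counts drive the backward probability adaptation seen elsewhere in this patch. A toy version of the pattern, with invented names:

    #include <stdio.h>

    #define NUM_CONTEXTS 4

    static unsigned counts[NUM_CONTEXTS][2];  /* [context][bit] */

    static void count_event(int ctx, int bit) {
      ++counts[ctx][bit];
    }

    /* 8-bit probability that the next bit in this context is 0,
     * clamped away from the impossible endpoints. */
    static int get_prob(int ctx) {
      const unsigned zeros = counts[ctx][0];
      const unsigned total = zeros + counts[ctx][1];
      int p = total ? (int)((256u * zeros) / total) : 128;
      if (p < 1) p = 1;
      if (p > 255) p = 255;
      return p;
    }

    int main(void) {
      count_event(2, 0);
      count_event(2, 0);
      count_event(2, 1);
      printf("p(zero | ctx 2) = %d/256\n", get_prob(2));  /* 170/256 */
      return 0;
    }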
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD * const xd = &x->e_mbd; +static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; switch (bsize) { case BLOCK_64X64: @@ -676,9 +683,8 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, } } -static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *xd = &x->e_mbd; +static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; switch (bsize) { case BLOCK_64X64: return &x->sb64_partitioning; @@ -698,7 +704,7 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -729,7 +735,7 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { const VP9_COMMON *const cm = &cpi->common; const MACROBLOCK *const x = &cpi->mb; const MACROBLOCKD *const xd = &x->e_mbd; @@ -760,7 +766,7 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, } static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { + int output_enabled, BLOCK_SIZE bsize, int sub_index) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; @@ -769,9 +775,9 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, return; if (sub_index != -1) - *(get_sb_index(xd, bsize)) = sub_index; + *get_sb_index(xd, bsize) = sub_index; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. 
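save_context and restore_context, whose signatures are updated above, implement the snapshot/rollback idiom the whole partition search depends on: trial encodes mutate the entropy and partition contexts, so each candidate must be scored from, and rolled back to, the same starting state. A compact sketch of the idiom (hypothetical types; the real code snapshots per-plane ENTROPY_CONTEXT rows and PARTITION_CONTEXT arrays):

    #include <stdio.h>
    #include <string.h>

    #define CTX_SIZE 16

    typedef struct {
      unsigned char above[CTX_SIZE];
      unsigned char left[CTX_SIZE];
    } coding_ctx;

    static void save_ctx(const coding_ctx *live, coding_ctx *snap) {
      memcpy(snap, live, sizeof(*snap));
    }

    static void restore_ctx(coding_ctx *live, const coding_ctx *snap) {
      memcpy(live, snap, sizeof(*live));
    }

    int main(void) {
      coding_ctx live = {{0}, {0}}, snap;
      save_ctx(&live, &snap);
      live.above[0] = 7;          /* trial encode mutates state */
      restore_ctx(&live, &snap);  /* next candidate starts clean */
      printf("above[0] after rollback: %d\n", live.above[0]);  /* 0 */
      return 0;
    }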
if (xd->ab_index > 0) @@ -790,22 +796,22 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE_TYPE bsize) { + int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; + BLOCK_SIZE c1 = BLOCK_8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; - int UNINITIALIZED_IS_SAFE(pl); + int pl = 0; PARTITION_TYPE partition; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; int i; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; c1 = BLOCK_4X4; - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); c1 = *(get_sb_partitioning(x, bsize)); @@ -814,7 +820,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, switch (partition) { case PARTITION_NONE: - if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) + if (output_enabled && bsize >= BLOCK_8X8) cpi->partition_count[pl][PARTITION_NONE]++; encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); break; @@ -839,7 +845,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, for (i = 0; i < 4; i++) { const int x_idx = i & 1, y_idx = i >> 1; - *(get_sb_index(xd, subsize)) = i; + *get_sb_index(xd, subsize) = i; encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } @@ -849,52 +855,114 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, break; } - if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) { + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } } -static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, - BLOCK_SIZE_TYPE bsize) { +// Check to see if the given partition size is allowed for a specified number +// of 8x8 block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, + int rows_left, int cols_left, + int *bh, int *bw) { + if ((rows_left <= 0) || (cols_left <= 0)) { + return MIN(bsize, BLOCK_8X8); + } else { + for (; bsize > 0; --bsize) { + *bh = num_8x8_blocks_high_lookup[bsize]; + *bw = num_8x8_blocks_wide_lookup[bsize]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return bsize; +} + +// This function attempts to set all mode info entries in a given SB64 +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. 
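find_partition_size, added above, walks down from the requested size until a block fits in the 8x8 rows and columns still inside the image; set_partitioning, which follows, leans on it for partial SB64s at the bottom and right borders. A self-contained sketch of the fitting logic with a simplified size enum (ordered small to large, mirroring how the lookup tables are indexed):

    #include <stdio.h>

    typedef enum { B8X8, B16X16, B32X32, B64X64, NUM_BSIZES } bsize_t;

    static const int blocks_high[NUM_BSIZES] = { 1, 2, 4, 8 };  /* 8x8 units */
    static const int blocks_wide[NUM_BSIZES] = { 1, 2, 4, 8 };

    static bsize_t fit_partition(bsize_t wanted, int rows_left, int cols_left) {
      bsize_t bs = wanted;
      if (rows_left <= 0 || cols_left <= 0)
        return B8X8;               /* degenerate edge: smallest size */
      while (bs > B8X8 &&
             (blocks_high[bs] > rows_left || blocks_wide[bs] > cols_left))
        --bs;                      /* step down until the block fits */
      return bs;
    }

    int main(void) {
      /* Only 3 rows and 5 columns of 8x8 units remain at the border. */
      printf("fitted size index: %d\n",
             fit_partition(B64X64, 3, 5));  /* B16X16, index 1 */
      return 0;
    }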
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; + int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row; + int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col; int block_row, block_col; - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - m[block_row * mis + block_col].mbmi.sb_type = bsize; + MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col; + int bh = num_8x8_blocks_high_lookup[bsize]; + int bw = num_8x8_blocks_wide_lookup[bsize]; + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // Apply the requested partition size to the SB64 if it is all "in image" + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + int index = block_row * mis + block_col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = bsize; + } + } + } else { + // Else this is a partial SB64. + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + int index = block_row * mis + block_col; + // Find a partition size that fits + bsize = find_partition_size(cpi->sf.always_this_block_size, + (row8x8_remaining - block_row), + (col8x8_remaining - block_col), &bh, &bw); + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = bsize; + } } } } -static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) { + +static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, + MODE_INFO **prev_mi_8x8) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; int block_row, block_col; + for (block_row = 0; block_row < 8; ++block_row) { for (block_col = 0; block_col < 8; ++block_col) { - m[block_row * mis + block_col].mbmi.sb_type = - p[block_row * mis + block_col].mbmi.sb_type; + MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col]; + BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; + int offset; + + if (prev_mi) { + offset = prev_mi - cm->prev_mi; + mi_8x8[block_row * mis + block_col] = cm->mi + offset; + mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type; + } } } } -static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m, - BLOCK_SIZE_TYPE bsize, int mis, int mi_row, +static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8, + BLOCK_SIZE bsize, int mis, int mi_row, int mi_col) { - int row, col; - int bwl = b_width_log2(bsize); - int bhl = b_height_log2(bsize); - int bsl = (bwl > bhl ? bwl : bhl); - - int bs = (1 << bsl) / 2; // Block size in units of 8 pels. 
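The other structural theme in these hunks: the per-8x8 grid now holds MODE_INFO pointers (mi_8x8) instead of inline structs, so set_partitioning and set_block_size can make every cell covered by a block alias one shared descriptor. A toy illustration of why the pointer grid makes block-level updates a single write (simplified types, not the libvpx layout):

    #include <stdio.h>

    typedef struct { int sb_type; } mode_info;

    #define GRID 8  /* one 64x64 superblock = 8x8 grid of 8x8 units */

    int main(void) {
      static mode_info storage[GRID * GRID];  /* backing store per 8x8 unit */
      mode_info *grid[GRID * GRID];
      mode_info *block = &storage[0];         /* top-left descriptor */
      int r, c;

      /* Point a 4x4-unit (32x32 pixel) region at the shared descriptor. */
      block->sb_type = 32;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          grid[r * GRID + c] = block;

      block->sb_type = 16;  /* one write re-types the whole region */
      printf("cell (3,3) sees sb_type %d\n", grid[3 * GRID + 3]->sb_type);
      return 0;
    }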
- MODE_INFO *m2 = m + mi_row * mis + mi_col; - for (row = 0; row < bs; row++) { - for (col = 0; col < bs; col++) { - if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols) - continue; - m2[row * mis + col].mbmi.sb_type = bsize; - } - } + int r, c; + const int bs = MAX(num_8x8_blocks_wide_lookup[bsize], + num_8x8_blocks_high_lookup[bsize]); + const int idx_str = mis * mi_row + mi_col; + MODE_INFO **const mi2 = &mi_8x8[idx_str]; + + mi2[0] = cm->mi + idx_str; + mi2[0]->mbmi.sb_type = bsize; + + for (r = 0; r < bs; r++) + for (c = 0; c < bs; c++) + if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols) + mi2[r * mis + c] = mi2[0]; } typedef struct { @@ -931,9 +999,9 @@ typedef enum { V64X64, } TREE_LEVEL; -static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) { +static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) { int i; - switch (block_size) { + switch (bsize) { case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; node->vt = &vt->vt; @@ -990,9 +1058,9 @@ void sum_2_variances(var *r, var *a, var*b) { a->sum_error + b->sum_error, a->count + b->count); } -static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { +static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { vt_node node; - tree_to_node(data, block_size, &node); + tree_to_node(data, bsize, &node); sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]); sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]); sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]); @@ -1002,7 +1070,7 @@ static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { #if PERFORM_RANDOM_PARTITIONING static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, - BLOCK_SIZE_TYPE block_size, int mi_row, + BLOCK_SIZE block_size, int mi_row, int mi_col, int mi_size) { VP9_COMMON * const cm = &cpi->common; vt_node vt; @@ -1038,30 +1106,30 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, return 0; } -#else +#else // !PERFORM_RANDOM_PARTITIONING -static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, - BLOCK_SIZE_TYPE block_size, int mi_row, +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m, + BLOCK_SIZE bsize, int mi_row, int mi_col, int mi_size) { VP9_COMMON * const cm = &cpi->common; vt_node vt; const int mis = cm->mode_info_stride; int64_t threshold = 50 * cpi->common.base_qindex; - tree_to_node(data, block_size, &vt); + tree_to_node(data, bsize, &vt); // split none is available only if we have more than half a block size // in width and height inside the visible image if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows && vt.vt->none.variance < threshold) { - set_block_size(cm, m, block_size, mis, mi_row, mi_col); + set_block_size(cm, m, bsize, mis, mi_row, mi_col); return 1; } // vertical split is available on all but the bottom border if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold && vt.vt->vert[1].variance < threshold) { - set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row, mi_col); return 1; } @@ -1069,17 +1137,17 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, // horizontal split is available on all but the right border if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold && vt.vt->horz[1].variance < threshold) { - set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), 
mis, mi_row, + set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row, mi_col); return 1; } return 0; } -#endif +#endif // PERFORM_RANDOM_PARTITIONING -static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, - int mi_col) { +static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, + int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK *x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -1095,7 +1163,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int pixels_wide = 64, pixels_high = 64; vp9_zero(vt); - set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); + set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); @@ -1122,13 +1190,16 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, &xd->scale_factor[0]); setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, &xd->scale_factor[1]); - xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; - xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; - vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]], + + xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME; + xd->this_mi->mbmi.sb_type = BLOCK_64X64; + vp9_find_best_ref_mvs(xd, + mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]], &nearest_mv, &near_mv); - xd->mode_info_context->mbmi.mv[0] = nearest_mv; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64); + xd->this_mi->mbmi.mv[0] = nearest_mv; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64); + d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; } @@ -1165,24 +1236,24 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. 
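set_vt_partitioning above turns the variance tree into early partition decisions: if the whole block's variance sits under a threshold scaled by the quantizer (50 * base_qindex in the hunk), code it unsplit; otherwise see whether a single vertical or horizontal split gets both halves under the threshold; otherwise recurse into quadrants. A sketch of just that decision, with the in-image border checks omitted:

    #include <stdio.h>

    typedef struct {
      long none;     /* variance of the whole block        */
      long vert[2];  /* variances of the two half-blocks   */
      long horz[2];  /* for the vertical/horizontal splits */
    } block_var;

    typedef enum { KEEP_WHOLE, SPLIT_VERT, SPLIT_HORZ, RECURSE } vt_choice;

    static vt_choice choose(const block_var *v, int base_qindex) {
      const long threshold = 50L * base_qindex;
      if (v->none < threshold)
        return KEEP_WHOLE;
      if (v->vert[0] < threshold && v->vert[1] < threshold)
        return SPLIT_VERT;
      if (v->horz[0] < threshold && v->horz[1] < threshold)
        return SPLIT_HORZ;
      return RECURSE;  /* fall through to the four quadrants */
    }

    int main(void) {
      block_var v = { 9000, { 2000, 2500 }, { 7000, 300 } };
      printf("choice: %d\n", choose(&v, 60));  /* 3000 -> SPLIT_VERT (1) */
      return 0;
    }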
- if (!set_vt_partitioning(cpi, &vt, m, BLOCK_64X64, mi_row, mi_col, + if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col, 4)) { for (i = 0; i < 4; ++i) { const int x32_idx = ((i & 1) << 2); const int y32_idx = ((i >> 1) << 2); - if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_32X32, + if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32, (mi_row + y32_idx), (mi_col + x32_idx), 2)) { for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); - if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m, + if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8, BLOCK_16X16, (mi_row + y32_idx + y16_idx), (mi_col + x32_idx + x16_idx), 1)) { for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - set_block_size(cm, m, BLOCK_8X8, mis, + set_block_size(cm, mi_8x8, BLOCK_8X8, mis, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx)); } @@ -1193,9 +1264,10 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, } } -static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, - int *rate, int64_t *dist, int do_recon) { +static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist, + int do_recon) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -1208,7 +1280,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int bss = (1 << bsl) / 4; int i, pl; PARTITION_TYPE partition = PARTITION_NONE; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; int last_part_rate = INT_MAX; @@ -1219,9 +1291,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int64_t none_dist = INT_MAX; int chosen_rate = INT_MAX; int64_t chosen_dist = INT_MAX; - BLOCK_SIZE_TYPE sub_subsize = BLOCK_4X4; + BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 0; - BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type; + BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -1230,7 +1302,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, subsize = get_subsize(bsize, partition); - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. if (xd->ab_index != 0) { @@ -1244,17 +1316,17 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); x->fast_ms = 0; - x->pred_mv.as_int = 0; x->subblock_ref = 0; if (cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. 
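rd_use_partition, continued below, ultimately keeps whichever of the candidates (the previous frame's partitioning, an unsplit PARTITION_NONE, or a full re-split) carries the lowest Lagrangian cost. A sketch of that comparison; the fixed-point shape mirrors the RDCOST uses later in the function, but the shifts here are illustrative rather than the exact macro:

    #include <stdio.h>
    #include <stdint.h>

    /* cost = lambda * rate + distortion, in shared fixed-point units. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

    int main(void) {
      /* Cheaper distortion but higher rate vs. the reverse. */
      int64_t last = rd_cost(100, 0, 5000, 40000);  /* last frame's layout */
      int64_t none = rd_cost(100, 0, 3000, 52000);  /* unsplit block */
      printf("keep %s\n",
             last <= none ? "last partitioning" : "PARTITION_NONE");
      return 0;
    }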
- if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) { + if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { sub_subsize = get_subsize(subsize, PARTITION_SPLIT); splits_below = 1; for (i = 0; i < 4; i++) { int jj = i >> 1, ii = i & 0x01; - if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) { + MODE_INFO * this_mi = mi_8x8[jj * bss * mis + ii * bss]; + if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) { splits_below = 0; } } @@ -1274,7 +1346,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, none_rate += x->partition_cost[pl][PARTITION_NONE]; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - m->mbmi.sb_type = bs_type; + mi_8x8[0]->mbmi.sb_type = bs_type; *(get_sb_partitioning(x, bsize)) = subsize; } } @@ -1285,16 +1357,16 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: - *(get_sb_index(xd, subsize)) = 0; + *get_sb_index(xd, subsize) = 0; pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && - bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) { + bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { int rt = 0; int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *(get_sb_index(xd, subsize)) = 1; + *get_sb_index(xd, subsize) = 1; pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1308,16 +1380,16 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, } break; case PARTITION_VERT: - *(get_sb_index(xd, subsize)) = 0; + *get_sb_index(xd, subsize) = 0; pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && - bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { + bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { int rt = 0; int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *(get_sb_index(xd, subsize)) = 1; + *get_sb_index(xd, subsize) = 1; pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1343,10 +1415,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *(get_sb_index(xd, subsize)) = i; + *get_sb_index(xd, subsize) = i; - rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &rt, &dt, i != 3); + rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, + i != 3); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; last_part_dist = INT_MAX; @@ -1365,10 +1438,10 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, last_part_rate += x->partition_cost[pl][partition]; if (cpi->sf.adjust_partitioning_from_last_frame - && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8 + && partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == 
cm->mi_cols)) { - BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); split_rate = 0; split_dist = 0; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1386,9 +1459,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, || (mi_col + x_idx >= cm->mi_cols)) continue; - *(get_sb_index(xd, split_subsize)) = i; - *(get_sb_partitioning(x, bsize)) = split_subsize; - *(get_sb_partitioning(x, split_subsize)) = split_subsize; + *get_sb_index(xd, split_subsize) = i; + *get_sb_partitioning(x, bsize) = split_subsize; + *get_sb_partitioning(x, split_subsize) = split_subsize; save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1427,8 +1500,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // If last_part is better set the partitioning to that... if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { - m->mbmi.sb_type = bsize; - if (bsize >= BLOCK_SIZE_SB8X8) + mi_8x8[0]->mbmi.sb_type = bsize; + if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = subsize; chosen_rate = last_part_rate; chosen_dist = last_part_dist; @@ -1436,7 +1509,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // If none was better set the partitioning to that... if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { - if (bsize >= BLOCK_SIZE_SB8X8) + if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = bsize; chosen_rate = none_rate; chosen_dist = none_dist; @@ -1446,37 +1519,68 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. - if ( bsize == BLOCK_SIZE_SB64X64) + if ( bsize == BLOCK_64X64) assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); if (do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); + encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); *rate = chosen_rate; *dist = chosen_dist; } -static BLOCK_SIZE_TYPE min_partition_size[BLOCK_SIZE_TYPES] = - { BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 }; -static BLOCK_SIZE_TYPE max_partition_size[BLOCK_SIZE_TYPES] = - { BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, - BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, - BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 }; +static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 +}; + +static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 +}; +// Look at all the mode_info entries for blocks that are part of this +// partition and find the min and max values for sb_type. +// At the moment this is designed to work on a 64x64 SB but could be +// adjusted to use a size parameter. +// +// The min and max are assumed to have been initialized prior to calling this +// function so repeat calls can accumulate a min and max of more than one sb64. 
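The two lookup tables above give one step of slack below and above an observed size, and the comment describes how the scan accumulates a min and max across neighbouring SB64s; the function itself follows in the next hunk. A sketch of the whole recipe under those assumptions (simplified size enum, invented names):

    #include <stdio.h>

    typedef enum { B8X8, B16X16, B32X32, B64X64, NUM_BSIZES } bsize_t;

    /* One notch of leeway each way, analogous to the tables above. */
    static const bsize_t widen_min[NUM_BSIZES] = { B8X8, B8X8, B16X16, B32X32 };
    static const bsize_t widen_max[NUM_BSIZES] = { B16X16, B32X32, B64X64, B64X64 };

    static void auto_range(const bsize_t *neighbor_sizes, int n,
                           bsize_t *min_bs, bsize_t *max_bs) {
      int i;
      *min_bs = B64X64;  /* start inverted, then tighten over the scan */
      *max_bs = B8X8;
      for (i = 0; i < n; ++i) {
        if (neighbor_sizes[i] < *min_bs) *min_bs = neighbor_sizes[i];
        if (neighbor_sizes[i] > *max_bs) *max_bs = neighbor_sizes[i];
      }
      *min_bs = widen_min[*min_bs];
      *max_bs = widen_max[*max_bs];
    }

    int main(void) {
      bsize_t seen[] = { B16X16, B16X16, B32X32 };  /* neighbours' choices */
      bsize_t lo, hi;
      auto_range(seen, 3, &lo, &hi);
      printf("search size indices %d..%d\n", lo, hi);  /* B8X8..B64X64 */
      return 0;
    }

Restricting the recursive search to this range is where the speed-up comes from; the leeway keeps one atypical neighbour from locking the current block out of a better size.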
+static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, + BLOCK_SIZE * min_block_size, + BLOCK_SIZE * max_block_size ) { + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + int sb_width_in_blocks = MI_BLOCK_SIZE; + int sb_height_in_blocks = MI_BLOCK_SIZE; + int i, j; + int index = 0; + + // Check the sb_type for each block that belongs to this region. + for (i = 0; i < sb_height_in_blocks; ++i) { + for (j = 0; j < sb_width_in_blocks; ++j) { + MODE_INFO * mi = mi_8x8[index+j]; + BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0; + *min_block_size = MIN(*min_block_size, sb_type); + *max_block_size = MAX(*max_block_size, sb_type); + } + index += xd->mode_info_stride; + } +} // Look at neighboring blocks and set a min and max partition size based on // what they chose. -static void rd_auto_partition_range(VP9_COMP *cpi, - BLOCK_SIZE_TYPE * min_block_size, - BLOCK_SIZE_TYPE * max_block_size) { +static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const MODE_INFO *const mi = xd->mode_info_context; - const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; - const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; - const int left_in_image = xd->left_available && left_mbmi->mb_in_image; - const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + MODE_INFO ** mi_8x8 = xd->mi_8x8; + const int left_in_image = xd->left_available && mi_8x8[-1]; + const int above_in_image = xd->up_available && + mi_8x8[-xd->mode_info_stride]; + MODE_INFO ** above_sb64_mi_8x8; + MODE_INFO ** left_sb64_mi_8x8; // Frequency check if (cpi->sf.auto_min_max_partition_count <= 0) { @@ -1484,51 +1588,182 @@ static void rd_auto_partition_range(VP9_COMP *cpi, cpi->sf.auto_min_max_partition_interval; *min_block_size = BLOCK_4X4; *max_block_size = BLOCK_64X64; - return; } else { --cpi->sf.auto_min_max_partition_count; + + // Set default values if no left or above neighbour + if (!left_in_image && !above_in_image) { + *min_block_size = BLOCK_4X4; + *max_block_size = BLOCK_64X64; + } else { + VP9_COMMON *const cm = &cpi->common; + int row8x8_remaining = cm->cur_tile_mi_row_end - row; + int col8x8_remaining = cm->cur_tile_mi_col_end - col; + int bh, bw; + + // Default "min to max" and "max to min" + *min_block_size = BLOCK_64X64; + *max_block_size = BLOCK_4X4; + + // Find the min and max partition sizes used in the left SB64 + if (left_in_image) { + left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE]; + get_sb_partition_size_range(cpi, left_sb64_mi_8x8, + min_block_size, max_block_size); + } + + // Find the min and max partition sizes used in the above SB64 taking + // the values found for left as a starting point. + if (above_in_image) { + above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE]; + get_sb_partition_size_range(cpi, above_sb64_mi_8x8, + min_block_size, max_block_size); + } + + // Give a bit of leaway either side of the observed min and max + *min_block_size = min_partition_size[*min_block_size]; + *max_block_size = max_partition_size[*max_block_size]; + + // Check border cases where max and min from neighbours may not be legal. 
+ *max_block_size = find_partition_size(*max_block_size, + row8x8_remaining, col8x8_remaining, + &bh, &bw); + *min_block_size = MIN(*min_block_size, *max_block_size); + } } +} - // Check for edge cases - if (!left_in_image && !above_in_image) { - *min_block_size = BLOCK_4X4; - *max_block_size = BLOCK_64X64; - } else if (!left_in_image) { - *min_block_size = min_partition_size[above_mbmi->sb_type]; - *max_block_size = max_partition_size[above_mbmi->sb_type]; - } else if (!above_in_image) { - *min_block_size = min_partition_size[left_mbmi->sb_type]; - *max_block_size = max_partition_size[left_mbmi->sb_type]; - } else { - *min_block_size = - min_partition_size[MIN(left_mbmi->sb_type, above_mbmi->sb_type)]; - *max_block_size = - max_partition_size[MAX(left_mbmi->sb_type, above_mbmi->sb_type)]; +static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + // Only use 8x8 result for non HD videos. + // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; + int use_8x8 = 1; + + if (cm->frame_type && !cpi->is_src_frame_alt_ref && + ((use_8x8 && bsize == BLOCK_16X16) || + bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { + int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; + PICK_MODE_CONTEXT *block_context = NULL; + + if (bsize == BLOCK_16X16) { + block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; + } else if (bsize == BLOCK_32X32) { + block_context = x->mb_context[xd->sb_index]; + } else if (bsize == BLOCK_64X64) { + block_context = x->sb32_context; + } + + if (block_context) { + ref0 = block_context[0].mic.mbmi.ref_frame[0]; + ref1 = block_context[1].mic.mbmi.ref_frame[0]; + ref2 = block_context[2].mic.mbmi.ref_frame[0]; + ref3 = block_context[3].mic.mbmi.ref_frame[0]; + } + + // Currently, only consider 4 inter reference frames. + if (ref0 && ref1 && ref2 && ref3) { + int d01, d23, d02, d13; + + // Motion vectors for the four subblocks. + int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; + int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; + int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; + int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; + int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; + int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; + int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; + int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; + + // Adjust sign if ref is alt_ref. + if (cm->ref_frame_sign_bias[ref0]) { + mvr0 *= -1; + mvc0 *= -1; + } + + if (cm->ref_frame_sign_bias[ref1]) { + mvr1 *= -1; + mvc1 *= -1; + } + + if (cm->ref_frame_sign_bias[ref2]) { + mvr2 *= -1; + mvc2 *= -1; + } + + if (cm->ref_frame_sign_bias[ref3]) { + mvr3 *= -1; + mvc3 *= -1; + } + + // Calculate mv distances. + d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); + d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); + d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); + d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); + + if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH && + d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) { + // Set fast motion search level. + x->fast_ms = 1; + + if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && + d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { + // Set fast motion search level. 
+ x->fast_ms = 2; + + if (!d01 && !d23 && !d02 && !d13) { + x->fast_ms = 3; + x->subblock_ref = ref0; + } + } + } + } } } +static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); +} + +static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); +} + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, - int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, + int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - int bsl = b_width_log2(bsize), bs = 1 << bsl; - int ms = bs / 2; + const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; int i, pl; - BLOCK_SIZE_TYPE subsize; - int srate = INT_MAX; - int64_t sdist = INT_MAX; - + BLOCK_SIZE subsize; + int this_rate, sum_rate = 0, best_rate = INT_MAX; + int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX; + int64_t sum_rd = 0; + int do_split = bsize >= BLOCK_8X8; + int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + ms >= cm->mi_rows); + const int force_vert_split = (mi_col + ms >= cm->mi_cols); + + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8; + int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8; + + int partition_split_done = 0; (void) *tp_orig; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. if (xd->ab_index != 0) { @@ -1539,320 +1774,228 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } assert(mi_height_log2(bsize) == mi_width_log2(bsize)); - save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - - // PARTITION_SPLIT - if (!cpi->sf.auto_min_max_partition_size || - bsize >= cpi->sf.min_partition_size) { - if (bsize > BLOCK_SIZE_SB8X8) { - int r4 = 0; - int64_t d4 = 0, sum_rd = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0; - int64_t d = 0; - - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. 
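+ // (Every candidate below is scored with RDCOST(x->rdmult, x->rddiv,
+ // rate, dist) and the cheapest kept; sf.min/max_partition_size bound the
+ // square sizes tried, while the forced horizontal/vertical splits keep
+ // frame-border blocks partitionable down to a legal size.)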
+ if (cpi->sf.auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && + bsize >= cpi->sf.min_partition_size); + partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && + bsize > cpi->sf.min_partition_size) || + force_horz_split); + partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && + bsize > cpi->sf.min_partition_size) || + force_vert_split); + do_split &= bsize > cpi->sf.min_partition_size; + } + if (cpi->sf.use_square_partition_only) { + partition_horz_allowed &= force_horz_split; + partition_vert_allowed &= force_vert_split; + } - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, - &d, i != 3, best_rd - sum_rd); + save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (r == INT_MAX) { - r4 = INT_MAX; - sum_rd = INT64_MAX; - } else { - r4 += r; - d4 += d; - sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); - } - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r4 != INT_MAX && i == 4) { - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - *(get_sb_partitioning(x, bsize)) = subsize; - assert(r4 >= 0); - assert(d4 >= 0); - srate = r4; - sdist = d4; - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + if (cpi->sf.disable_split_var_thresh && partition_none_allowed) { + unsigned int source_variancey; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + source_variancey = get_sby_perpixel_variance(cpi, x, bsize); + if (source_variancey < cpi->sf.disable_split_var_thresh) { + do_split = 0; + if (source_variancey < cpi->sf.disable_split_var_thresh / 2) + do_rect = 0; } } - // Use 4 subblocks' motion estimation results to speed up current - // partition's checking. - x->fast_ms = 0; - x->pred_mv.as_int = 0; - x->subblock_ref = 0; - - if (cpi->sf.using_small_partition_info && - (!cpi->sf.auto_min_max_partition_size || - (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size))) { - // Only use 8x8 result for non HD videos. - // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 
1 : 0; - int use_8x8 = 1; - - if (cm->frame_type && !cpi->is_src_frame_alt_ref && - ((use_8x8 && bsize == BLOCK_16X16) || - bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { - int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; - PICK_MODE_CONTEXT *block_context = NULL; - - if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; - } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[xd->sb_index]; - } else if (bsize == BLOCK_SIZE_SB64X64) { - block_context = x->sb32_context; - } - - if (block_context) { - ref0 = block_context[0].mic.mbmi.ref_frame[0]; - ref1 = block_context[1].mic.mbmi.ref_frame[0]; - ref2 = block_context[2].mic.mbmi.ref_frame[0]; - ref3 = block_context[3].mic.mbmi.ref_frame[0]; + // PARTITION_NONE + if (partition_none_allowed) { + pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize, + get_block_context(x, bsize), best_rd); + if (this_rate != INT_MAX) { + if (bsize >= BLOCK_8X8) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + this_rate += x->partition_cost[pl][PARTITION_NONE]; } + sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { + int64_t stop_thresh = 2048; + + best_rate = this_rate; + best_dist = this_dist; + best_rd = sum_rd; + if (bsize >= BLOCK_8X8) + *(get_sb_partitioning(x, bsize)) = bsize; - // Currently, only consider 4 inter ref frames. - if (ref0 && ref1 && ref2 && ref3) { - int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0, - mvr3 = 0, mvc3 = 0; - int d01, d23, d02, d13; // motion vector distance between 2 blocks - - // Get each subblock's motion vectors. - mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; - mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; - mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; - mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; - mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; - mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; - mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; - mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; - - // Adjust sign if ref is alt_ref - if (cm->ref_frame_sign_bias[ref0]) { - mvr0 *= -1; - mvc0 *= -1; - } - - if (cm->ref_frame_sign_bias[ref1]) { - mvr1 *= -1; - mvc1 *= -1; - } - - if (cm->ref_frame_sign_bias[ref2]) { - mvr2 *= -1; - mvc2 *= -1; - } + // Adjust threshold according to partition size. + stop_thresh >>= 8 - (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize]); - if (cm->ref_frame_sign_bias[ref3]) { - mvr3 *= -1; - mvc3 *= -1; + // If obtained distortion is very small, choose current partition + // and stop splitting. + if (this_dist < stop_thresh) { + do_split = 0; + do_rect = 0; } + } + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - // Calculate mv distances. - d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); - d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); - d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); - d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); - - if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) { - // Set fast motion search level. 
- x->fast_ms = 1; + // store estimated motion vector + if (cpi->sf.adaptive_motion_search) + store_pred_mv(x, get_block_context(x, bsize)); - // Calculate prediction MV - x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2; - x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2; + // PARTITION_SPLIT + sum_rd = 0; + // TODO(jingning): use the motion vectors given by the above search as + // the starting point of motion search in the following partition type check. + if (do_split) { + subsize = get_subsize(bsize, PARTITION_SPLIT); + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + const int x_idx = (i & 1) * ms; + const int y_idx = (i >> 1) * ms; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; - if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && - d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { - // Set fast motion search level. - x->fast_ms = 2; + *get_sb_index(xd, subsize) = i; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rate, &this_dist, i != 3, best_rd - sum_rd); - if (!d01 && !d23 && !d02 && !d13) { - x->fast_ms = 3; - x->subblock_ref = ref0; - } - } - } + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); } } - } - - if (!cpi->sf.auto_min_max_partition_size || - bsize <= cpi->sf.max_partition_size) { - int larger_is_better = 0; - // PARTITION_NONE - if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - int r; - int64_t d; - pick_sb_modes(cpi, mi_row, mi_col, &r, &d, bsize, - get_block_context(x, bsize), best_rd); - if (r != INT_MAX && bsize >= BLOCK_SIZE_SB8X8) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_NONE]; - } - - if (r != INT_MAX && - (bsize == BLOCK_SIZE_SB8X8 || - RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, srate, sdist))) { - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r, d)); - srate = r; - sdist = d; - larger_is_better = 1; - if (bsize >= BLOCK_SIZE_SB8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + if (sum_rd < best_rd && i == 4) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { + best_rate = sum_rate; + best_dist = sum_dist; + best_rd = sum_rd; + *(get_sb_partitioning(x, bsize)) = subsize; + } else { + // skip rectangular partition test when larger block size + // gives better rd cost + if (cpi->sf.less_rectangular_check) + do_rect &= !partition_none_allowed; } } + partition_split_done = 1; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - if (bsize == BLOCK_SIZE_SB8X8) { - int r4 = 0; - int64_t d4 = 0, sum_rd = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0; - int64_t d = 0; - - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; - - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, - &d, i != 3, best_rd - sum_rd); + x->fast_ms = 0; + x->subblock_ref = 0; - if (r == INT_MAX) { - r4 = INT_MAX; - sum_rd = INT64_MAX; - } else 
{ - r4 += r; - d4 += d; - sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); - } + if (partition_split_done && + cpi->sf.using_small_partition_info) { + compute_fast_motion_search_level(cpi, bsize); + } + + // PARTITION_HORZ + if (partition_horz_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_HORZ); + *get_sb_index(xd, subsize) = 0; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + + if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *get_sb_index(xd, subsize) = 1; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); } + } + if (sum_rd < best_rd) { set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); - if (r4 != INT_MAX && i == 4) { - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - if (RDCOST(x->rdmult, x->rddiv, r4, d4) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r4; - sdist = d4; - larger_is_better = 0; - *(get_sb_partitioning(x, bsize)) = subsize; - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); - } + sum_rate += x->partition_cost[pl][PARTITION_HORZ]; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { + best_rd = sum_rd; + best_rate = sum_rate; + best_dist = sum_dist; + *(get_sb_partitioning(x, bsize)) = subsize; } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - if (!cpi->sf.use_square_partition_only && - (!cpi->sf.less_rectangular_check ||!larger_is_better)) { - // PARTITION_HORZ - if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { - int r2, r = 0; - int64_t d2, d = 0, h_rd; - subsize = get_subsize(bsize, PARTITION_HORZ); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, - get_block_context(x, subsize), best_rd); - h_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); - - if (r2 != INT_MAX && h_rd < best_rd && - mi_row + (ms >> 1) < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &r, &d, subsize, - get_block_context(x, subsize), best_rd - h_rd); - if (r == INT_MAX) { - r2 = INT_MAX; - } else { - r2 += r; - d2 += d; - } - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (r2 != INT_MAX && RDCOST(x->rdmult, x->rddiv, r2, d2) - < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r2, d2)); - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + // PARTITION_VERT + if (partition_vert_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_VERT); + + 
*get_sb_index(xd, subsize) = 0; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *get_sb_index(xd, subsize) = 1; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); } - - // PARTITION_VERT - if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { - int r2; - int64_t d2, v_rd; - subsize = get_subsize(bsize, PARTITION_VERT); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, - get_block_context(x, subsize), best_rd); - v_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); - if (r2 != INT_MAX && v_rd < best_rd && - mi_col + (ms >> 1) < cm->mi_cols) { - int r = 0; - int64_t d = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &r, &d, subsize, - get_block_context(x, subsize), best_rd - v_rd); - if (r == INT_MAX) { - r2 = INT_MAX; - } else { - r2 += r; - d2 += d; - } - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (r2 != INT_MAX && - RDCOST(x->rdmult, x->rddiv, r2, d2) - < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } + if (sum_rd < best_rd) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + sum_rate += x->partition_cost[pl][PARTITION_VERT]; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { + best_rate = sum_rate; + best_dist = sum_dist; + best_rd = sum_rd; + *(get_sb_partitioning(x, bsize)) = subsize; } } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - *rate = srate; - *dist = sdist; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (srate < INT_MAX && sdist < INT_MAX && do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); + *rate = best_rate; + *dist = best_dist; - if (bsize == BLOCK_SIZE_SB64X64) { + if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) + encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); - assert(srate < INT_MAX); - assert(sdist < INT_MAX); + assert(best_rate < INT_MAX); + assert(best_dist < INT_MAX); } else { assert(tp_orig == *tp); } @@ -1863,7 +2006,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl; + int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; int ms = bs / 2; 
 ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
 PARTITION_CONTEXT sl[8], sa[8];
@@ -1871,7 +2014,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
 int r;
 int64_t d;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 // Default is no mask (all reference frames allowed).
 cpi->ref_frame_mask = 0;
@@ -1880,17 +2023,17 @@
 if ((mi_row + (ms >> 1) < cm->mi_rows) &&
 (mi_col + (ms >> 1) < cm->mi_cols)) {
 cpi->set_ref_frame_mask = 1;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_SIZE_SB64X64,
- get_block_context(x, BLOCK_SIZE_SB64X64), INT64_MAX);
+ pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64,
+ get_block_context(x, BLOCK_64X64), INT64_MAX);
 set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+ pl = partition_plane_context(xd, BLOCK_64X64);
 r += x->partition_cost[pl][PARTITION_NONE];
- *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64;
+ *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
 cpi->set_ref_frame_mask = 0;
 }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 }
 static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
@@ -1908,12 +2051,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
 int dummy_rate;
 int64_t dummy_dist;
- // Initialize a mask of modes that we will not consider;
- // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden)
- if (cpi->common.frame_type == KEY_FRAME)
- cpi->unused_mode_skip_mask = 0;
- else
- cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00;
+ vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv));
 if (cpi->sf.reference_masking)
 rd_pick_reference_frame(cpi, mi_row, mi_col);
@@ -1921,18 +2059,18 @@
 if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
 cpi->sf.use_one_partition_size_always ) {
 const int idx_str = cm->mode_info_stride * mi_row + mi_col;
- MODE_INFO *m = cm->mi + idx_str;
- MODE_INFO *p = cm->prev_mi + idx_str;
+ MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+ MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 cpi->mb.source_variance = UINT_MAX;
 if (cpi->sf.use_one_partition_size_always) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
- set_partitioning(cpi, m, cpi->sf.always_this_block_size);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
+ set_partitioning(cpi, mi_8x8, mi_row, mi_col);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
 &dummy_rate, &dummy_dist, 1);
 } else if (cpi->sf.partition_by_variance) {
- choose_partitioning(cpi, cm->mi, mi_row, mi_col);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
 &dummy_rate, &dummy_dist, 1);
 } else {
 if ((cpi->common.current_video_frame
@@ -1943,26 +2081,28 @@
 || cpi->is_src_frame_alt_ref) {
 // If required set upper and lower partition size limits
 if (cpi->sf.auto_min_max_partition_size) {
- rd_auto_partition_range(cpi,
+ set_offsets(cpi, mi_row, mi_col, 
BLOCK_64X64); + rd_auto_partition_range(cpi, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } else { - copy_partitioning(cpi, m, p); - rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + copy_partitioning(cpi, mi_8x8, prev_mi_8x8); + rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } } } else { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - rd_auto_partition_range(cpi, &cpi->sf.min_partition_size, + set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, mi_row, mi_col, + &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } } @@ -1993,8 +2133,8 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - xd->mode_info_context->mbmi.mode = DC_PRED; - xd->mode_info_context->mbmi.uv_mode = DC_PRED; + xd->this_mi->mbmi.mode = DC_PRED; + xd->this_mi->mbmi.uv_mode = DC_PRED; vp9_zero(cpi->y_mode_count) vp9_zero(cpi->y_uv_mode_count) @@ -2023,7 +2163,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; cpi->mb.optimize = 0; - cpi->mb.e_mbd.lf.filter_level = 0; + cpi->common.lf.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; cpi->common.tx_mode = ONLY_4X4; } else { @@ -2070,8 +2210,14 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->txfm_stepdown_count); - xd->mode_info_context = cm->mi; - xd->prev_mode_info_context = cm->prev_mi; + xd->mi_8x8 = cm->mi_grid_visible; + // required for vp9_frame_init_quantizer + xd->this_mi = + xd->mi_8x8[0] = cm->mi; + xd->mic_stream_ptr = cm->mi; + + xd->last_mi = cm->prev_mi; + vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts); @@ -2095,7 +2241,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { build_activity_map(cpi); } - // re-initencode frame context. + // Re-initialize encode frame context. 
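 // (i.e. default the prediction modes back to DC_PRED and zero the y/uv
 // intra mode counts, as in the init_encode_frame_mb_context hunk above)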
init_encode_frame_mb_context(cpi); vp9_zero(cpi->rd_comp_pred_diff); @@ -2164,10 +2310,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { } static int check_dual_ref_flags(VP9_COMP *cpi) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - int ref_flags = cpi->ref_frame_flags; + const int ref_flags = cpi->ref_frame_flags; - if (vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { + if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { return 0; } else { return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) @@ -2175,12 +2320,12 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { } } -static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) { +static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) { int x, y; for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) { - if (!mi[y * mis + x].mbmi.mb_skip_coeff) + if (!mi_8x8[y * mis + x]->mbmi.skip_coeff) return 0; } } @@ -2188,85 +2333,75 @@ static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) { return 1; } -static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, - TX_SIZE txfm_size) { +static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs, + TX_SIZE tx_size) { int x, y; for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) - mi[y * mis + x].mbmi.txfm_size = txfm_size; + mi_8x8[y * mis + x]->mbmi.tx_size = tx_size; } } -static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis, - TX_SIZE txfm_max, int bw, int bh, int mi_row, - int mi_col, BLOCK_SIZE_TYPE bsize) { +static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, + int mis, TX_SIZE max_tx_size, int bw, int bh, + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &mi->mbmi; + MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (mbmi->txfm_size > txfm_max) { - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + if (mbmi->tx_size > max_tx_size) { const int ymbs = MIN(bh, cm->mi_rows - mi_row); const int xmbs = MIN(bw, cm->mi_cols - mi_col); - xd->mode_info_context = mi; - assert(vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi, mis, ymbs, xmbs)); - set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); + assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + get_skip_flag(mi_8x8, mis, ymbs, xmbs)); + set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); } } -static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, - TX_SIZE txfm_max, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { +static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, + TX_SIZE max_tx_size, int mi_row, int mi_col, + BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bwl, bhl; - const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); + int bw, bh; + const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = mi_width_log2(mi->mbmi.sb_type); - bhl = mi_height_log2(mi->mbmi.sb_type); + bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; + bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; - if (bwl == bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row, + if (bw == bs && bh == bs) { + reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, bs, mi_row, + mi_col, bsize); + } else if (bw == bs && 
bh < bs) { + reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, hbs, mi_row, mi_col, bsize); - } else if (bwl == bsl && bhl < bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col, - bsize); - reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs, - mi_row + bs, mi_col, bsize); - } else if (bwl < bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col, - bsize); - reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row, - mi_col + bs, bsize); + reset_skip_txfm_size_b(cpi, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs, + mi_row + hbs, mi_col, bsize); + } else if (bw < bs && bh == bs) { + reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, hbs, bs, mi_row, + mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row, + mi_col + hbs, bsize); + } else { - BLOCK_SIZE_TYPE subsize; + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; - assert(bwl < bsl && bhl < bsl); - if (bsize == BLOCK_64X64) { - subsize = BLOCK_32X32; - } else if (bsize == BLOCK_32X32) { - subsize = BLOCK_16X16; - } else { - assert(bsize == BLOCK_16X16); - subsize = BLOCK_8X8; - } + assert(bw < bs && bh < bs); for (n = 0; n < 4; n++) { - const int y_idx = n >> 1, x_idx = n & 0x01; + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); - reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max, - mi_row + y_idx * bs, mi_col + x_idx * bs, - subsize); + reset_skip_txfm_size_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size, + mi_row + mi_dr, mi_col + mi_dc, subsize); } } } @@ -2275,13 +2410,14 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { VP9_COMMON * const cm = &cpi->common; int mi_row, mi_col; const int mis = cm->mode_info_stride; - MODE_INFO *mi, *mi_ptr = cm->mi; +// MODE_INFO *mi, *mi_ptr = cm->mi; + MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { - mi = mi_ptr; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) { - reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col, - BLOCK_SIZE_SB64X64); + mi_8x8 = mi_ptr; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) { + reset_skip_txfm_size_sb(cpi, mi_8x8, txfm_max, mi_row, mi_col, + BLOCK_64X64); } } } @@ -2334,7 +2470,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this // requires further work in the rd loop. For now the only supported encoder - // side behaviour is where the ALT ref buffer has opposite sign bias to + // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
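 // (With binary sign-bias flags this means LAST and GOLDEN agree while
 // ALTREF differs; any other combination keeps single prediction only.)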
if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[GOLDEN_FRAME]) @@ -2387,27 +2523,26 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_filter_threshes[frame_type][1] > cpi->rd_filter_threshes[frame_type][2] && cpi->rd_filter_threshes[frame_type][1] > - cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { - filter_type = vp9_switchable_interp[1]; + cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { + filter_type = EIGHTTAP_SMOOTH; } else if (cpi->rd_filter_threshes[frame_type][2] > cpi->rd_filter_threshes[frame_type][0] && cpi->rd_filter_threshes[frame_type][2] > - cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { - filter_type = vp9_switchable_interp[2]; + cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { + filter_type = EIGHTTAP_SHARP; } else if (cpi->rd_filter_threshes[frame_type][0] > - cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { - filter_type = vp9_switchable_interp[0]; + cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { + filter_type = EIGHTTAP; } else { filter_type = SWITCHABLE; } - /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */ - cpi->mb.e_mbd.lossless = 0; if (cpi->oxcf.lossless) { cpi->mb.e_mbd.lossless = 1; } + /* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */ select_tx_mode(cpi); cpi->common.comp_pred_mode = pred_type; cpi->common.mcomp_filter_type = filter_type; @@ -2419,7 +2554,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= SWITCHABLE_FILTERS; i++) { const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; cpi->rd_filter_threshes[frame_type][i] = (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; @@ -2495,29 +2630,22 @@ void vp9_encode_frame(VP9_COMP *cpi) { } -static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { - const MACROBLOCKD *xd = &x->e_mbd; - const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; - const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode; - - ++cpi->y_uv_mode_count[m][uvm]; - if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - const int bsl = MIN(bwl, bhl); - ++cpi->y_mode_count[MIN(bsl, 3)][m]; - } else { +static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) { + const MB_PREDICTION_MODE y_mode = mi->mbmi.mode; + const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; + + ++cpi->y_uv_mode_count[y_mode][uv_mode]; + + if (bsize < BLOCK_8X8) { int idx, idy; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[ - xd->mode_info_context->mbmi.sb_type]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[ - xd->mode_info_context->mbmi.sb_type]; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode; - ++cpi->y_mode_count[0][m]; - } - } + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) + ++cpi->y_mode_count[0][mi->bmi[idy * 2 + idx].as_mode]; + } else { + ++cpi->y_mode_count[size_group_lookup[bsize]][y_mode]; } } @@ -2541,19 +2669,19 @@ static void adjust_act_zbin(VP9_COMP *cpi, 
MACROBLOCK *x) { x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); #endif } - static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - MODE_INFO *mi = xd->mode_info_context; + MODE_INFO **mi_8x8 = xd->mi_8x8; + MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - x->rd_search = 0; + x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH); if (x->skip_encode) @@ -2582,7 +2710,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; - } else if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { + } else if (mbmi->sb_type < BLOCK_8X8) { cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST; } else { cpi->zbin_mode_boost = MV_ZBIN_BOOST; @@ -2595,13 +2723,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_update_zbin_extra(cpi, x); } - if (mbmi->ref_frame[0] == INTRA_FRAME) { - vp9_encode_intra_block_y( - cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - vp9_encode_intra_block_uv( - cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + if (!is_inter_block(mbmi)) { + vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8)); + vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8)); if (output_enabled) - sum_intra_stats(cpi, x); + sum_intra_stats(cpi, mi); } else { int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])]; YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; @@ -2619,44 +2745,37 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, &xd->scale_factor[1]); - vp9_build_inter_predictors_sb( - xd, mi_row, mi_col, - bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize); + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); } - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { - vp9_tokenize_sb(cpi, t, !output_enabled, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + if (!is_inter_block(mbmi)) { + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else if (!x->skip) { - vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - vp9_tokenize_sb(cpi, t, !output_enabled, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { - int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0; - mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff; + int mb_skip_context = xd->left_available ? mi_8x8[-1]->mbmi.skip_coeff : 0; + mb_skip_context += mi_8x8[-mis] ? mi_8x8[-mis]->mbmi.skip_coeff : 0; - mbmi->mb_skip_coeff = 1; + mbmi->skip_coeff = 1; if (output_enabled) cm->counts.mbskip[mb_skip_context][1]++; - vp9_reset_sb_tokens_context( - xd, (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 : bsize); + reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } - // copy skip flag on all mb_mode_info contexts in this SB - // if this was a skip at this txfm size - vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, mi->mbmi.mb_skip_coeff); - if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && - mbmi->sb_type >= BLOCK_SIZE_SB8X8 && + mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && - (mbmi->mb_skip_coeff || - vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) { + (mbmi->skip_coeff || + vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx); + update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx); } else { int x, y; - TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode; + TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; + assert(sizeof(tx_mode_to_biggest_tx_size) / + sizeof(tx_mode_to_biggest_tx_size[0]) == TX_MODES); // The new intra coding scheme requires no change of transform size if (is_inter_block(&mi->mbmi)) { if (sz == TX_32X32 && bsize < BLOCK_32X32) @@ -2666,18 +2785,15 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (sz == TX_8X8 && bsize < BLOCK_8X8) sz = TX_4X4; } else if (bsize >= BLOCK_8X8) { - sz = mbmi->txfm_size; + sz = mbmi->tx_size; } else { sz = TX_4X4; } - for (y = 0; y < mi_height; y++) { - for (x = 0; x < mi_width; x++) { - if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) { - mi[mis * y + x].mbmi.txfm_size = sz; - } - } - } + for (y = 0; y < mi_height; y++) + for (x = 0; x < mi_width; x++) + if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) + mi_8x8[mis * y + x]->mbmi.tx_size = sz; } } } diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c index edbd2d9..c5e5dff 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.c +++ b/libvpx/vp9/encoder/vp9_encodeintra.c @@ -15,14 +15,14 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodeintra.h" -int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; - (void) cpi; +int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { + MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi; x->skip_encode = 0; mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? - TX_16X16 : TX_8X8) : TX_4X4; - vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); + mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? 
TX_16X16 + : TX_8X8) + : TX_4X4; + vp9_encode_intra_block_y(x, mbmi->sb_type); return vp9_get_mb_ss(x->plane[0].src_diff); } diff --git a/libvpx/vp9/encoder/vp9_encodeintra.h b/libvpx/vp9/encoder/vp9_encodeintra.h index 16ac59e..e217924 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.h +++ b/libvpx/vp9/encoder/vp9_encodeintra.h @@ -13,12 +13,8 @@ #include "vp9/encoder/vp9_onyx_int.h" -int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred); -void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg); -void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb, - BLOCK_SIZE_TYPE bs); -void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb, - BLOCK_SIZE_TYPE bs); +int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred); +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); #endif // VP9_ENCODER_VP9_ENCODEINTRA_H_ diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 40b0a4e..8dd80a5 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -69,7 +69,7 @@ static void inverse_transform_b_16x16_add(int eob, vp9_short_idct16x16_add(dqcoeff, dest, stride); } -static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { +static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -81,18 +81,18 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { subtract_plane(x, bsize, 0); } -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) { int i; for (i = 1; i < MAX_MB_PLANE; i++) subtract_plane(x, bsize, i); } -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { vp9_subtract_sby(x, bsize); vp9_subtract_sbuv(x, bsize); } @@ -142,37 +142,36 @@ static int trellis_get_coeff_context(const int16_t *scan, } static void optimize_b(MACROBLOCK *mb, - int plane, int block, BLOCK_SIZE_TYPE bsize, + int plane, int block, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, TX_SIZE tx_size) { MACROBLOCKD *const xd = &mb->e_mbd; - const int ref = is_inter_block(&xd->mode_info_context->mbmi); + struct macroblockd_plane *pd = &xd->plane[plane]; + const int ref = is_inter_block(&xd->this_mi->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; - const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, - block, 16); + const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); int16_t *qcoeff_ptr; int16_t *dqcoeff_ptr; - int eob = xd->plane[plane].eobs[block], final_eob, sz = 0; + int eob = pd->eobs[block], final_eob, sz = 0; const int i0 = 0; int rc, x, next, i; int64_t rdmult, rddiv, rd_cost0, rd_cost1; int rate0, rate1, error0, error1, t0, t1; int best, band, pt; - PLANE_TYPE type = xd->plane[plane].plane_type; + PLANE_TYPE type = pd->plane_type; int err_mult = plane_rd_mult[type]; int default_eob; const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; - const int ib = txfrm_block_to_raster_block(xd, bsize, plane, - block, 2 * tx_size); - const 
int16_t *dequant_ptr = xd->plane[plane].dequant; + const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block); + const int16_t *dequant_ptr = pd->dequant; const uint8_t * band_translate; assert((!type && !plane) || (type && plane)); - dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16); - qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); + dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); + qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); switch (tx_size) { default: case TX_4X4: @@ -200,7 +199,7 @@ static void optimize_b(MACROBLOCK *mb, /* Now set up a Viterbi trellis to evaluate alternative roundings. */ rdmult = mb->rdmult * err_mult; - if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) + if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) rdmult = (rdmult * 9) >> 4; rddiv = mb->rddiv; /* Initialize the sentinel node of the trellis. */ @@ -371,59 +370,48 @@ static void optimize_b(MACROBLOCK *mb, *a = *l = (final_eob > 0); } -void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, MACROBLOCK *mb, - struct optimize_ctx *ctx) { - MACROBLOCKD *const xd = &mb->e_mbd; +void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { int x, y; - - // find current entropy context - txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); - - optimize_b(mb, plane, block, bsize, - &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2); -} - -static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - const struct encode_b_args* const args = arg; - vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->x, args->ctx); + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + optimize_b(mb, plane, block, plane_bsize, + &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size); } -void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) { - const struct encode_b_args* const args = arg; +static void optimize_init_b(int plane, BLOCK_SIZE bsize, + struct encode_b_args *args) { const MACROBLOCKD *xd = &args->x->e_mbd; const struct macroblockd_plane* const pd = &xd->plane[plane]; - const int bwl = b_width_log2(bsize) - pd->subsampling_x; - const int bhl = b_height_log2(bsize) - pd->subsampling_y; - const int bw = 1 << bwl, bh = 1 << bhl; - const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->txfm_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; int i; switch (tx_size) { case TX_4X4: vpx_memcpy(args->ctx->ta[plane], pd->above_context, - sizeof(ENTROPY_CONTEXT) * bw); + sizeof(ENTROPY_CONTEXT) * num_4x4_w); vpx_memcpy(args->ctx->tl[plane], pd->left_context, - sizeof(ENTROPY_CONTEXT) * bh); + sizeof(ENTROPY_CONTEXT) * num_4x4_h); break; case TX_8X8: - for (i = 0; i < bw; i += 2) + for (i = 0; i < num_4x4_w; i += 2) args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 2) + for (i = 0; i < num_4x4_h; i += 2) args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i]; break; case TX_16X16: - for (i = 0; i < bw; i += 4) + for (i = 0; i < num_4x4_w; i += 4) args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 4) + for (i = 0; i < num_4x4_h; i += 4) args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i]; break; case TX_32X32: - for (i = 0; i < bw; i += 8) + for (i = 0; i < num_4x4_w; i += 8) args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 8) + for (i = 0; i < num_4x4_h; i += 8) args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i]; break; default: @@ -431,38 +419,19 @@ void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) { } } -void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - optimize_init_b(0, bsize, &arg); - foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg); -} - -void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - int i; - for (i = 1; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); - - foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg); -} - -void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + int16_t *coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int16_t *scan, *iscan; uint16_t *eob = &pd->eobs[block]; - const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl; + const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; const int twl = bwl - tx_size, twmask = (1 << twl) - 1; int xoff, yoff; int16_t *src_diff; @@ -475,7 +444,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, xoff = 32 * (block & twmask); yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - if (x->rd_search) + if (x->use_lp32x32fdct) vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else vp9_short_fdct32x32(src_diff, coeff, bw * 8); @@ -523,29 +492,27 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, } } -static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, + 
TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, - raster_block, + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); - xform_quant(plane, block, bsize, ss_txfrm_size, arg); + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) - vp9_optimize_b(plane, block, bsize, ss_txfrm_size, x, args->ctx); + vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); - if (x->skip_encode) - return; - if (pd->eobs[block] == 0) + if (x->skip_encode || pd->eobs[block] == 0) return; - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; @@ -564,28 +531,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, dst, pd->dst.stride); break; + default: + assert(!"Invalid transform size"); } } -void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL}; - - foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg); -} - -void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL}; - - foreach_transformed_block_uv(xd, bsize, xform_quant, &arg); -} - -void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {x, &ctx}; vp9_subtract_sby(x, bsize); if (x->optimize) @@ -594,25 +548,10 @@ void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); } -void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - - vp9_subtract_sbuv(x, bsize); - if (x->optimize) { - int i; - for (i = 1; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); - } - - foreach_transformed_block_uv(xd, bsize, encode_block, &arg); -} - -void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {x, &ctx}; vp9_subtract_sb(x, bsize); @@ -625,35 +564,32 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { foreach_transformed_block(xd, bsize, encode_block, &arg); } -void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD 
*const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + int16_t *coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int16_t *scan, *iscan; TX_TYPE tx_type; MB_PREDICTION_MODE mode; - const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl; + const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; const int twl = bwl - tx_size, twmask = (1 << twl) - 1; int xoff, yoff; uint8_t *src, *dst; int16_t *src_diff; uint16_t *eob = &pd->eobs[block]; - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); - } + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) + extend_for_intra(xd, plane_bsize, plane, block, tx_size); // if (x->optimize) - // vp9_optimize_b(plane, block, bsize, ss_txfrm_size, - // x, args->ctx); + // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); switch (tx_size) { case TX_32X32: @@ -670,7 +606,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (x->rd_search) + if (x->use_lp32x32fdct) vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else vp9_short_fdct32x32(src_diff, coeff, bw * 8); @@ -699,8 +635,8 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); else x->fwd_txm16x16(src_diff, coeff, bw * 8); - vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) @@ -743,7 +679,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, scan = get_scan_4x4(tx_type); iscan = get_iscan_4x4(tx_type); if (mbmi->sb_type < BLOCK_8X8 && plane == 0) - mode = xd->mode_info_context->bmi[block].as_mode; + mode = xd->this_mi->bmi[block].as_mode; else mode = plane == 0 ? 
mbmi->mode : mbmi->uv_mode; @@ -778,20 +714,18 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, } } -void vp9_encode_intra_block_y(VP9_COMMON *cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { +void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {x, &ctx}; - foreach_transformed_block_in_plane(xd, bsize, 0, - encode_block_intra, &arg); + foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra, + &arg); } -void vp9_encode_intra_block_uv(VP9_COMMON *cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { +void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg); + struct encode_b_args arg = {x, &ctx}; + foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg); } diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index f647fd9..54e69fd 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -16,8 +16,28 @@ #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_onyxc_int.h" +typedef enum { + RD_DC_PRED = DC_PRED, + RD_V_PRED = V_PRED, + RD_H_PRED = H_PRED, + RD_D45_PRED = D45_PRED, + RD_D135_PRED = D135_PRED, + RD_D117_PRED = D117_PRED, + RD_D153_PRED = D153_PRED, + RD_D207_PRED = D207_PRED, + RD_D63_PRED = D63_PRED, + RD_TM_PRED = TM_PRED, + RD_NEARESTMV = NEARESTMV, + RD_NEARMV = NEARMV, + RD_ZEROMV = ZEROMV, + RD_NEWMV = NEWMV, + RD_I4X4_PRED, + RD_SPLITMV, + RD_MODE_COUNT +} RD_PREDICTION_MODE; + typedef struct { - MB_PREDICTION_MODE mode; + RD_PREDICTION_MODE mode; MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME second_ref_frame; } MODE_DEFINITION; @@ -28,28 +48,22 @@ struct optimize_ctx { }; struct encode_b_args { - VP9_COMMON *cm; MACROBLOCK *x; struct optimize_ctx *ctx; }; -void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, MACROBLOCK *x, - struct optimize_ctx *ctx); -void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_optimize_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize); + +void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); -void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize); -void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg); -void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); #endif // VP9_ENCODER_VP9_ENCODEMB_H_ 
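In the vp9_encodemb.c hunks above, BLOCK_OFFSET loses its third argument: every call site passed the same stride of 16 int16_t coefficients per 4x4 transform block, so the constant can move into the macro itself. A minimal sketch of that change follows; the one-argument definition is assumed here rather than quoted from vp9_blockd.h.

#include <stdint.h>
#include <stdio.h>

/* Assumed post-change form: the 16-coefficient stride of a 4x4 block is
 * baked in, so call sites index by block number only. */
#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)

int main(void) {
  int16_t coeff[64 * 16] = {0};
  int16_t *blk = BLOCK_OFFSET(coeff, 5);        /* block 5's coefficients */
  printf("offset = %d\n", (int)(blk - coeff));  /* prints: offset = 80 */
  return 0;
}

This is why rewrites such as BLOCK_OFFSET(p->coeff, block, 16) become BLOCK_OFFSET(p->coeff, block) throughout the file.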
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index 1c6fa3a..ed3a2bb 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -20,10 +20,6 @@ extern unsigned int active_section; #endif -#ifdef NMV_STATS -nmv_context_counts tnmvcounts; -#endif - static void encode_mv_component(vp9_writer* w, int comp, const nmv_component* mvcomp, int usehp) { int offset; @@ -159,7 +155,6 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_class0_hp)[2], unsigned int (*branch_ct_hp)[2]) { int i, j, k; - vp9_counts_process(nmv_count, usehp); vp9_tree_probs_from_distribution(vp9_mv_joint_tree, prob->joints, branch_ct_joint, @@ -218,152 +213,6 @@ static void counts_to_nmv_context( } } -#ifdef NMV_STATS -void init_nmvstats() { - vp9_zero(tnmvcounts); -} - -void print_nmvstats() { - nmv_context prob; - unsigned int branch_ct_joint[MV_JOINTS - 1][2]; - unsigned int branch_ct_sign[2][2]; - unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; - unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2]; - unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2]; - unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2]; - unsigned int branch_ct_fp[2][4 - 1][2]; - unsigned int branch_ct_class0_hp[2][2]; - unsigned int branch_ct_hp[2][2]; - int i, j, k; - counts_to_nmv_context(&tnmvcounts, &prob, 1, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); - - printf("\nCounts =\n { "); - for (j = 0; j < MV_JOINTS; ++j) - printf("%d, ", tnmvcounts.joints[j]); - printf("},\n"); - for (i = 0; i < 2; ++i) { - printf(" {\n"); - printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0], - tnmvcounts.comps[i].sign[1]); - printf(" { "); - for (j = 0; j < MV_CLASSES; ++j) - printf("%d, ", tnmvcounts.comps[i].classes[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d, ", tnmvcounts.comps[i].class0[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0], - tnmvcounts.comps[i].bits[j][1]); - printf("},\n"); - - printf(" {"); - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("},\n"); - - printf(" { "); - for (j = 0; j < 4; ++j) - printf("%d, ", tnmvcounts.comps[i].fp[j]); - printf("},\n"); - - printf(" %d/%d,\n", - tnmvcounts.comps[i].class0_hp[0], - tnmvcounts.comps[i].class0_hp[1]); - printf(" %d/%d,\n", - tnmvcounts.comps[i].hp[0], - tnmvcounts.comps[i].hp[1]); - printf(" },\n"); - } - - printf("\nProbs =\n { "); - for (j = 0; j < MV_JOINTS - 1; ++j) - printf("%d, ", prob.joints[j]); - printf("},\n"); - for (i=0; i< 2; ++i) { - printf(" {\n"); - printf(" %d,\n", prob.comps[i].sign); - printf(" { "); - for (j = 0; j < MV_CLASSES - 1; ++j) - printf("%d, ", prob.comps[i].classes[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE - 1; ++j) - printf("%d, ", prob.comps[i].class0[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d, ", prob.comps[i].bits[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 3; ++k) - printf("%d, ", prob.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("},\n"); - printf(" { "); - for (j = 0; j < 3; ++j) - printf("%d, ", prob.comps[i].fp[j]); - printf("},\n"); - - printf(" %d,\n", 
prob.comps[i].class0_hp); - printf(" %d,\n", prob.comps[i].hp); - printf(" },\n"); - } -} - -static void add_nmvcount(nmv_context_counts* const dst, - const nmv_context_counts* const src) { - int i, j, k; - for (j = 0; j < MV_JOINTS; ++j) { - dst->joints[j] += src->joints[j]; - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_VALS; ++j) { - dst->comps[i].mvcount[j] += src->comps[i].mvcount[j]; - } - dst->comps[i].sign[0] += src->comps[i].sign[0]; - dst->comps[i].sign[1] += src->comps[i].sign[1]; - for (j = 0; j < MV_CLASSES; ++j) { - dst->comps[i].classes[j] += src->comps[i].classes[j]; - } - for (j = 0; j < CLASS0_SIZE; ++j) { - dst->comps[i].class0[j] += src->comps[i].class0[j]; - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - dst->comps[i].bits[j][0] += src->comps[i].bits[j][0]; - dst->comps[i].bits[j][1] += src->comps[i].bits[j][1]; - } - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - for (k = 0; k < 4; ++k) { - dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k]; - } - } - for (j = 0; j < 4; ++j) { - dst->comps[i].fp[j] += src->comps[i].fp[j]; - } - dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0]; - dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1]; - dst->comps[i].hp[0] += src->comps[i].hp[0]; - dst->comps[i].hp[1] += src->comps[i].hp[1]; - } -} -#endif - void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int i, j; nmv_context prob; @@ -378,10 +227,6 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_hp[2][2]; nmv_context *mvc = &cpi->common.fc.nmvc; -#ifdef NMV_STATS - if (!cpi->dummy_packing) - add_nmvcount(&tnmvcounts, &cpi->NMVcount); -#endif counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, branch_ct_joint, branch_ct_sign, branch_ct_classes, branch_ct_class0, branch_ct_bits, @@ -390,22 +235,22 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { for (j = 0; j < MV_JOINTS - 1; ++j) update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], - VP9_NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, - prob.comps[i].sign, VP9_NMV_UPDATE_PROB); + prob.comps[i].sign, NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], - prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].classes[j], NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], - prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].class0[j], NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], - prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].bits[j], NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { @@ -414,20 +259,20 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { for (k = 0; k < 3; ++k) update_mv(bc, branch_ct_class0_fp[i][j][k], &mvc->comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); + prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB); } for (j = 0; j < 3; ++j) update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], - prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].fp[j], NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, - prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + prob.comps[i].class0_hp, NMV_UPDATE_PROB); 
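// Annotation on the update_mv() calls above and below: each call
// conditionally refreshes one stored MV probability. The encoder
// presumably signals an update flag coded with probability
// NMV_UPDATE_PROB (renamed from VP9_NMV_UPDATE_PROB throughout this
// hunk) and sends a new value only when the branch counts justify the
// cost, so unchanged contexts stay cheap.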
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, - prob.comps[i].hp, VP9_NMV_UPDATE_PROB); + prob.comps[i].hp, NMV_UPDATE_PROB); } } } @@ -471,7 +316,7 @@ void vp9_build_nmv_cost_table(int *mvjoint, void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv) { - MODE_INFO *mi = x->e_mbd.mode_info_context; + MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; MV diff; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; @@ -488,7 +333,7 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col; vp9_inc_mv(&diff, &cpi->NMVcount); - if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) { + if (mi->mbmi.ref_frame[1] > INTRA_FRAME) { diff.row = mi->bmi[i].as_mv[1].as_mv.row - second_best_ref_mv->as_mv.row; diff.col = mi->bmi[i].as_mv[1].as_mv.col - diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 6ba2a4f..9cf7b83 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -346,7 +346,7 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r // Set up pointers for this macro block recon buffer xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; - switch (xd->mode_info_context->mbmi.sb_type) { + switch (xd->this_mi->mbmi.sb_type) { case BLOCK_8X8: vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, @@ -385,7 +385,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp9_variance_fn_ptr_t v_fn_ptr = - cpi->fn_ptr[xd->mode_info_context->mbmi.sb_type]; + cpi->fn_ptr[xd->this_mi->mbmi.sb_type]; int new_mv_mode_penalty = 256; int sr = 0; @@ -402,7 +402,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, further_steps -= sr; // override the default variance function to use MSE - switch (xd->mode_info_context->mbmi.sb_type) { + switch (xd->this_mi->mbmi.sb_type) { case BLOCK_8X8: v_fn_ptr.vf = vp9_mse8x8; break; @@ -505,8 +505,11 @@ void vp9_first_pass(VP9_COMP *cpi) { setup_dst_planes(xd, new_yv12, 0, 0); x->partition_info = x->pi; - - xd->mode_info_context = cm->mi; + xd->mi_8x8 = cm->mi_grid_visible; + // required for vp9_frame_init_quantizer + xd->this_mi = + xd->mi_8x8[0] = cm->mi; + xd->mic_stream_ptr = cm->mi; setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -549,26 +552,26 @@ void vp9_first_pass(VP9_COMP *cpi) { if (mb_col * 2 + 1 < cm->mi_cols) { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mode_info_context->mbmi.sb_type = BLOCK_16X16; + xd->this_mi->mbmi.sb_type = BLOCK_16X16; } else { - xd->mode_info_context->mbmi.sb_type = BLOCK_16X8; + xd->this_mi->mbmi.sb_type = BLOCK_16X8; } } else { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mode_info_context->mbmi.sb_type = BLOCK_8X16; + xd->this_mi->mbmi.sb_type = BLOCK_8X16; } else { - xd->mode_info_context->mbmi.sb_type = BLOCK_8X8; + xd->this_mi->mbmi.sb_type = BLOCK_8X8; } } - xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; + xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME; set_mi_row_col(cm, xd, mb_row << 1, - 1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type), + 1 << mi_height_log2(xd->this_mi->mbmi.sb_type), mb_col << 1, - 1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type)); + 1 << mi_height_log2(xd->this_mi->mbmi.sb_type)); // do intra 16x16 prediction - this_error = 
vp9_encode_intra(cpi, x, use_dc_pred); + this_error = vp9_encode_intra(x, use_dc_pred); // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame) // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate for the mv. @@ -661,13 +664,13 @@ void vp9_first_pass(VP9_COMP *cpi) { mv.as_mv.col <<= 3; this_error = motion_error; vp9_set_mbmode_and_mvs(x, NEWMV, &mv); - xd->mode_info_context->mbmi.txfm_size = TX_4X4; - xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; - xd->mode_info_context->mbmi.ref_frame[1] = NONE; + xd->this_mi->mbmi.tx_size = TX_4X4; + xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME; + xd->this_mi->mbmi.ref_frame[1] = NONE; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, - xd->mode_info_context->mbmi.sb_type); - vp9_encode_sby(cm, x, xd->mode_info_context->mbmi.sb_type); + xd->this_mi->mbmi.sb_type); + vp9_encode_sby(x, xd->this_mi->mbmi.sb_type); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; @@ -1092,7 +1095,6 @@ static int estimate_cq(VP9_COMP *cpi, return q; } - extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { @@ -1580,7 +1582,7 @@ void define_fixed_arf_period(VP9_COMP *cpi) { // Analyse and define a gf/arf group. static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS next_frame = { 0 }; FIRSTPASS_STATS *start_pos; int i; double boost_score = 0.0; @@ -1616,8 +1618,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { start_pos = cpi->twopass.stats_in; - vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean - // Load stats for the current frame. mod_frame_err = calculate_modified_err(cpi, this_frame); @@ -1720,6 +1720,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { old_boost_score = boost_score; } + cpi->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); + // Don't allow a gf too near the next kf if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { @@ -2081,63 +2083,71 @@ void vp9_second_pass(VP9_COMP *cpi) { vp9_clear_system_state(); - // Special case code for first frame. - if (cpi->common.current_video_frame == 0) { - cpi->twopass.est_max_qcorrection_factor = 1.0; - - // Set a cq_level in constrained quality mode. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left)); - - cpi->cq_target_quality = cpi->oxcf.cq_level; - if (est_cq > cpi->cq_target_quality) - cpi->cq_target_quality = est_cq; - } + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + cpi->active_worst_quality = cpi->oxcf.cq_level; + } else { + // Special case code for first frame. + if (cpi->common.current_video_frame == 0) { + int section_target_bandwidth = + (int)(cpi->twopass.bits_left / frames_left); + cpi->twopass.est_max_qcorrection_factor = 1.0; + + // Set a cq_level in constrained quality mode.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, + section_target_bandwidth); + + cpi->cq_target_quality = cpi->oxcf.cq_level; + if (est_cq > cpi->cq_target_quality) + cpi->cq_target_quality = est_cq; + } - // guess at maxq needed in 2nd pass - cpi->twopass.maxq_max_limit = cpi->worst_quality; - cpi->twopass.maxq_min_limit = cpi->best_quality; + // guess at maxq needed in 2nd pass + cpi->twopass.maxq_max_limit = cpi->worst_quality; + cpi->twopass.maxq_min_limit = cpi->best_quality; - tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left)); + tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, + section_target_bandwidth); - cpi->active_worst_quality = tmp_q; - cpi->ni_av_qi = tmp_q; - cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); + cpi->active_worst_quality = tmp_q; + cpi->ni_av_qi = tmp_q; + cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); #ifndef ONE_SHOT_Q_ESTIMATE - // Limit the maxq value returned subsequently. - // This increases the risk of overspend or underspend if the initial - // estimate for the clip is bad, but helps prevent excessive - // variation in Q, especially near the end of a clip - // where for example a small overspend may cause Q to crash - adjust_maxq_qrange(cpi); + // Limit the maxq value returned subsequently. + // This increases the risk of overspend or underspend if the initial + // estimate for the clip is bad, but helps prevent excessive + // variation in Q, especially near the end of a clip + // where for example a small overspend may cause Q to crash + adjust_maxq_qrange(cpi); #endif - } + } #ifndef ONE_SHOT_Q_ESTIMATE - // The last few frames of a clip almost always have to few or too many - // bits and for the sake of over exact rate control we dont want to make - // radical adjustments to the allowed quantizer range just to use up a - // few surplus bits or get beneath the target rate. - else if ((cpi->common.current_video_frame < - (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) && - ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < - (unsigned int)cpi->twopass.total_stats.count)) { - if (frames_left < 1) - frames_left = 1; - - tmp_q = estimate_max_q( - cpi, - &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left)); - - // Make a damped adjustment to active max Q - cpi->active_worst_quality = - adjust_active_maxq(cpi->active_worst_quality, tmp_q); - } + // The last few frames of a clip almost always have too few or too many + // bits and for the sake of over-exact rate control we don't want to make + // radical adjustments to the allowed quantizer range just to use up a + // few surplus bits or get beneath the target rate.
+ else if ((cpi->common.current_video_frame < + (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) && + ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < + (unsigned int)cpi->twopass.total_stats.count)) { + int section_target_bandwidth = + (int)(cpi->twopass.bits_left / frames_left); + if (frames_left < 1) + frames_left = 1; + + tmp_q = estimate_max_q( + cpi, + &cpi->twopass.total_left_stats, + section_target_bandwidth); + + // Make a damped adjustment to active max Q + cpi->active_worst_quality = + adjust_active_maxq(cpi->active_worst_quality, tmp_q); + } #endif + } vp9_zero(this_frame); if (EOF == input_stats(cpi, &this_frame)) return; @@ -2157,6 +2167,8 @@ void vp9_second_pass(VP9_COMP *cpi) { // Define next gf group and assign bits to it this_frame_copy = this_frame; + cpi->gf_zeromotion_pct = 0; + #if CONFIG_MULTIPLE_ARF if (cpi->multi_arf_enabled) { define_fixed_arf_period(cpi); @@ -2167,6 +2179,15 @@ void vp9_second_pass(VP9_COMP *cpi) { } #endif + if (cpi->gf_zeromotion_pct > 995) { + // As long as max_thresh for encode breakout is small enough, it is ok + // to enable it for no-show frame, i.e. set enable_encode_breakout to 2. + if (!cpi->common.show_frame) + cpi->enable_encode_breakout = 0; + else + cpi->enable_encode_breakout = 2; + } + // If we are going to code an altref frame at the end of the group // and the current frame is not a key frame.... // If the previous group used an arf this frame has already benefited diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 154d31a..5a671f2 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -40,14 +40,15 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, (cpi->speed < 8 ? (cpi->speed > 5 ? 
1 : 0) : 2); step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); - vp9_clamp_mv_min_max(x, ref_mv); + vp9_clamp_mv_min_max(x, &ref_mv->as_mv); ref_full.as_mv.col = ref_mv->as_mv.col >> 3; ref_full.as_mv.row = ref_mv->as_mv.row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, dst_mv, step_param, x->errorperbit, - &v_fn_ptr, NULL, NULL, NULL, NULL, ref_mv); + best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, + 0, &v_fn_ptr, + 0, ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) @@ -58,7 +59,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x, dst_mv, ref_mv, x->errorperbit, &v_fn_ptr, - NULL, NULL, + 0, cpi->sf.subpel_iters_per_step, NULL, NULL, & distortion, &sse); } @@ -144,7 +145,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, for (mode = DC_PRED; mode <= TM_PRED; mode++) { unsigned int err; - xd->mode_info_context->mbmi.mode = mode; + xd->this_mi->mbmi.mode = mode; vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); @@ -240,9 +241,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int mb_col, mb_row, offset = 0; int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; int_mv arf_top_mv, gld_top_mv; - MODE_INFO mi_local; - - vp9_zero(mi_local); + MODE_INFO mi_local = { { 0 } }; // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; @@ -254,7 +253,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->plane[0].dst.stride = buf->y_stride; xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; - xd->mode_info_context = &mi_local; + xd->this_mi = &mi_local; mi_local.mbmi.sb_type = BLOCK_16X16; mi_local.mbmi.ref_frame[0] = LAST_FRAME; mi_local.mbmi.ref_frame[1] = NONE; @@ -308,7 +307,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, static void separate_arf_mbs(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int mb_col, mb_row, offset, i; - int ncnt[4]; + int ncnt[4] = { 0 }; int n_frames = cpi->mbgraph_n_frames; int *arf_not_zz; @@ -344,7 +343,6 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - vpx_memset(ncnt, 0, sizeof(ncnt)); for (offset = 0, mb_row = 0; mb_row < cm->mb_rows; offset += cm->mb_cols, mb_row++) { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 88beee7..1360088 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -8,28 +8,30 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <stdio.h> #include <limits.h> #include <math.h> +#include <stdio.h> -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_mcomp.h" -#include "vpx_mem/vpx_mem.h" #include "./vpx_config.h" + +#include "vpx_mem/vpx_mem.h" + #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_mcomp.h" + // #define NEW_DIAMOND_SEARCH -void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { - int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); - int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 
1 : 0); - int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; - int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; +void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) { + const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); + const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); + const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; - /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */ + // Get intersection of UMV window and valid MV window to reduce # of checks + // in diamond search. if (x->mv_col_min < col_min) x->mv_col_min = col_min; if (x->mv_col_max > col_max) @@ -245,52 +247,112 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { }, \ v = INT_MAX;) -int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1) { +#define FIRST_LEVEL_CHECKS \ + { \ + unsigned int left, right, up, down, diag; \ + CHECK_BETTER(left, tr, tc - hstep); \ + CHECK_BETTER(right, tr, tc + hstep); \ + CHECK_BETTER(up, tr - hstep, tc); \ + CHECK_BETTER(down, tr + hstep, tc); \ + whichdir = (left < right ? 0 : 1) + \ + (up < down ? 0 : 2); \ + switch (whichdir) { \ + case 0: \ + CHECK_BETTER(diag, tr - hstep, tc - hstep); \ + break; \ + case 1: \ + CHECK_BETTER(diag, tr - hstep, tc + hstep); \ + break; \ + case 2: \ + CHECK_BETTER(diag, tr + hstep, tc - hstep); \ + break; \ + case 3: \ + CHECK_BETTER(diag, tr + hstep, tc + hstep); \ + break; \ + } \ + } + +#define SECOND_LEVEL_CHECKS \ + { \ + int kr, kc; \ + unsigned int second; \ + if (tr != br && tc != bc) { \ + kr = br - tr; \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + kr, tc + 2 * kc); \ + CHECK_BETTER(second, tr + 2 * kr, tc + kc); \ + } else if (tr == br && tc != bc) { \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \ + CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \ + switch (whichdir) { \ + case 0: \ + case 1: \ + CHECK_BETTER(second, tr + hstep, tc + kc); \ + break; \ + case 2: \ + case 3: \ + CHECK_BETTER(second, tr - hstep, tc + kc); \ + break; \ + } \ + } else if (tr != br && tc == bc) { \ + kr = br - tr; \ + CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \ + CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \ + switch (whichdir) { \ + case 0: \ + case 2: \ + CHECK_BETTER(second, tr + kr, tc + hstep); \ + break; \ + case 1: \ + case 3: \ + CHECK_BETTER(second, tr + kr, tc - hstep); \ + break; \ + } \ + } \ + } + +int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1) { uint8_t *z = x->plane[0].src.buf; int src_stride = x->plane[0].src.stride; MACROBLOCKD *xd = &x->e_mbd; - int rr, rc, br, bc, hstep; - int tr, tc; unsigned int besterr = INT_MAX; - unsigned int left, right, up, down, diag; unsigned int sse; unsigned int whichdir; - unsigned int halfiters = 4; - unsigned int quarteriters = 4; - unsigned int eighthiters = 4; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; int thismse; - int maxc, minc, maxr, minr; - int y_stride; - int offset; uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * 
xd->plane[0].pre[0].stride + bestmv->as_mv.col; - y_stride = xd->plane[0].pre[0].stride; - - rr = ref_mv->as_mv.row; - rc = ref_mv->as_mv.col; - br = bestmv->as_mv.row << 3; - bc = bestmv->as_mv.col << 3; - hstep = 4; - minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1)); - maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1)); - minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1)); - maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1)); + const int y_stride = xd->plane[0].pre[0].stride; - tr = br; - tc = bc; + int rr = ref_mv->as_mv.row; + int rc = ref_mv->as_mv.col; + int br = bestmv->as_mv.row << 3; + int bc = bestmv->as_mv.col << 3; + int hstep = 4; + const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX); + const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX); + const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX); + const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX); + int tr = br; + int tc = bc; - offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; // central mv bestmv->as_mv.row <<= 3; @@ -303,105 +365,45 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, // TODO: Each subsequent iteration checks at least one point in // common with the last iteration could be 2 ( if diag selected) - while (--halfiters) { + while (halfiters--) { // 1/2 pel - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; - } - + FIRST_LEVEL_CHECKS; // no reason to check the same one again. if (tr == br && tc == bc) break; - tr = br; tc = bc; } // TODO: Each subsequent iteration checks at least one point in common with // the last iteration could be 2 ( if diag selected) 1/4 pel - hstep >>= 1; - while (--quarteriters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + while (quarteriters--) { + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. + if (tr == br && tc == bc) break; + tr = br; + tc = bc; } - - // no reason to check the same one again. 
- if (tr == br && tc == bc) - break; - - tr = br; - tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) { + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { hstep >>= 1; - while (--eighthiters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; - } - + while (eighthiters--) { + FIRST_LEVEL_CHECKS; // no reason to check the same one again. if (tr == br && tc == bc) break; - tr = br; tc = bc; } } + bestmv->as_mv.row = br; bestmv->as_mv.col = bc; @@ -412,39 +414,31 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, return besterr; } -#undef DIST -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ - z, src_stride, &sse, second_pred) - -int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, +int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, - const uint8_t *second_pred, int w, int h) { + unsigned int *sse1) { uint8_t *z = x->plane[0].src.buf; int src_stride = x->plane[0].src.stride; MACROBLOCKD *xd = &x->e_mbd; - int rr, rc, br, bc, hstep; int tr, tc; unsigned int besterr = INT_MAX; - unsigned int left, right, up, down, diag; unsigned int sse; unsigned int whichdir; - unsigned int halfiters = 4; - unsigned int quarteriters = 4; - unsigned int eighthiters = 4; int thismse; int maxc, minc, maxr, minr; int y_stride; int offset; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; @@ -456,19 +450,18 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, br = bestmv->as_mv.row << 3; bc = bestmv->as_mv.col << 3; hstep = 4; - minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - - ((1 << MV_MAX_BITS) - 1)); - maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + - ((1 << MV_MAX_BITS) - 1)); - minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - - ((1 << MV_MAX_BITS) - 1)); - maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + - ((1 << MV_MAX_BITS) - 1)); + minc = MAX(x->mv_col_min << 3, + (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1)); + maxc = MIN(x->mv_col_max << 3, + (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1)); + minr = MAX(x->mv_row_min << 3, + (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1)); + maxr = MIN(x->mv_row_max << 3, + (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1)); tr = br; tc = bc; - offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; // central mv @@ -476,114 +469,40 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, bestmv->as_mv.col <<= 3; // calculate central point error - // TODO(yunqingwang): central pointer error was already calculated in full- - // pixel search, and can be passed in this function. 
- comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); - besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + besterr = vfp->vf(y, y_stride, z, src_stride, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - // Each subsequent iteration checks at least one point in - // common with the last iteration could be 2 ( if diag selected) - while (--halfiters) { - // 1/2 pel - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + // 1/2 pel + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; } - - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; tc = bc; } - // Each subsequent iteration checks at least one point in common with - // the last iteration could be 2 ( if diag selected) 1/4 pel - hstep >>= 1; - while (--quarteriters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; } - - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) { - hstep >>= 1; - while (--eighthiters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; - } - - // no reason to check the same one again. 
- if (tr == br && tc == bc) - break; - - tr = br; - tc = bc; - } - } bestmv->as_mv.row = br; bestmv->as_mv.col = bc; @@ -594,636 +513,236 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, return besterr; } - -#undef MVC -#undef PRE #undef DIST -#undef IFMVCV -#undef CHECK_BETTER -#undef MIN -#undef MAX +/* returns subpixel variance error function */ +#define DIST(r, c) \ + vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ + z, src_stride, &sse, second_pred) -int vp9_find_best_sub_pixel_step(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1) { - int bestmse = INT_MAX; - int_mv startmv; - int_mv this_mv; - int_mv orig_mv; - int yrow_movedback = 0, ycol_movedback = 0; - uint8_t *z = x->plane[0].src.buf; - int src_stride = x->plane[0].src.stride; - int left, right, up, down, diag; +int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { + uint8_t *const z = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + MACROBLOCKD *const xd = &x->e_mbd; + + unsigned int besterr = INT_MAX; unsigned int sse; - int whichdir; + unsigned int whichdir; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; int thismse; - int y_stride; - MACROBLOCKD *xd = &x->e_mbd; - uint8_t *y = xd->plane[0].pre[0].buf + + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + uint8_t *const y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; - y_stride = xd->plane[0].pre[0].stride; - - // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; - startmv = *bestmv; - orig_mv = *bestmv; - - // calculate central point error - bestmse = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = bestmse; - bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); - thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse); - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } - - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); - thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse); - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.row += 8; - thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - 
*sse1 = sse; - } + const int y_stride = xd->plane[0].pre[0].stride; + int rr = ref_mv->as_mv.row; + int rc = ref_mv->as_mv.col; + int br = bestmv->as_mv.row << 3; + int bc = bestmv->as_mv.col << 3; + int hstep = 4; + const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX); + const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX); + const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX); + const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX); - // now check 1 more diagonal - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - // for(whichdir =0;whichdir<4;whichdir++) - // { - this_mv = startmv; + int tr = br; + int tc = bc; - switch (whichdir) { - case 0: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, src_stride, - &sse); - break; - case 1: - this_mv.as_mv.col += 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, src_stride, - &sse); - break; - case 2: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse); - break; - case 3: - default: - this_mv.as_mv.col += 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse); - break; - } - - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } - -// } - - - // time to check quarter pels. - if (bestmv->as_mv.row < startmv.as_mv.row) { - y -= y_stride; - yrow_movedback = 1; - } - - if (bestmv->as_mv.col < startmv.as_mv.col) { - y--; - ycol_movedback = 1; - } + const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; - startmv = *bestmv; - - - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col = startmv.as_mv.col - 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, - src_stride, &sse); - } - - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.col += 4; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } - - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row = startmv.as_mv.row - 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), - z, src_stride, &sse); - } - - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.row += 4; - thismse = vfp->svf(y, 
y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - *sse1 = sse; - } - - - // now check 1 more diagonal - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - -// for(whichdir=0;whichdir<4;whichdir++) -// { - this_mv = startmv; - - switch (whichdir) { - case 0: - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 2; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, y_stride, - SP(6), SP(this_mv.as_mv.row), z, src_stride, &sse); - } - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 2; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - y_stride - 1, y_stride, - SP(6), SP(6), z, src_stride, &sse); - } - } - - break; - case 1: - this_mv.as_mv.col += 2; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse); - } + // central mv + bestmv->as_mv.row <<= 3; + bestmv->as_mv.col <<= 3; - break; - case 2: - this_mv.as_mv.row += 2; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, - src_stride, &sse); - } + // calculate central point error + // TODO(yunqingwang): central pointer error was already calculated in full- + // pixel search, and can be passed in this function. + comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + // Each subsequent iteration checks at least one point in + // common with the last iteration could be 2 ( if diag selected) + while (halfiters--) { + // 1/2 pel + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. 
+ if (tr == br && tc == bc) break; - case 3: - this_mv.as_mv.col += 2; - this_mv.as_mv.row += 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - break; - } - - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } - - if (!(xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv))) - return bestmse; - - /* Now do 1/8th pixel */ - if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) { - y -= y_stride; - yrow_movedback = 1; - } - - if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) { - y--; - ycol_movedback = 1; - } - - startmv = *bestmv; - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col = startmv.as_mv.col - 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } - - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.col += 2; - thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } - - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row = startmv.as_mv.row - 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); + tr = br; + tc = bc; } - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; - } + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel - this_mv.as_mv.row += 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - *sse1 = sse; + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + while (quarteriters--) { + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + tr = br; + tc = bc; + } } - // now check 1 more diagonal - whichdir = (left < right ? 0 : 1) + (up < down ? 
0 : 2); - -// for(whichdir=0;whichdir<4;whichdir++) -// { - this_mv = startmv; - - switch (whichdir) { - case 0: - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 1; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - 1, y_stride, - SP(7), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 1; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - y_stride - 1, y_stride, - SP(7), SP(7), z, src_stride, &sse); - } - } - - break; - case 1: - this_mv.as_mv.col += 1; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); - } - - break; - case 2: - this_mv.as_mv.row += 1; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - 1, y_stride, - SP(7), SP(this_mv.as_mv.row), z, src_stride, &sse); - } - - break; - case 3: - this_mv.as_mv.col += 1; - this_mv.as_mv.row += 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - break; + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { + hstep >>= 1; + while (eighthiters--) { + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. 
+ if (tr == br && tc == bc) + break; + tr = br; + tc = bc; + } } + bestmv->as_mv.row = br; + bestmv->as_mv.col = bc; - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; - return bestmse; + return besterr; } -#undef SP - -int vp9_find_best_half_pixel_step(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1) { - int bestmse = INT_MAX; - int_mv startmv; - int_mv this_mv; +int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { uint8_t *z = x->plane[0].src.buf; int src_stride = x->plane[0].src.stride; - int left, right, up, down, diag; + MACROBLOCKD *xd = &x->e_mbd; + int rr, rc, br, bc, hstep; + int tr, tc; + unsigned int besterr = INT_MAX; unsigned int sse; - int whichdir; + unsigned int whichdir; int thismse; + int maxc, minc, maxr, minr; int y_stride; - MACROBLOCKD *xd = &x->e_mbd; + int offset; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + - (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + + bestmv->as_mv.col; + y_stride = xd->plane[0].pre[0].stride; + rr = ref_mv->as_mv.row; + rc = ref_mv->as_mv.col; + br = bestmv->as_mv.row << 3; + bc = bestmv->as_mv.col << 3; + hstep = 4; + minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - + ((1 << MV_MAX_BITS) - 1)); + maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + + ((1 << MV_MAX_BITS) - 1)); + minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - + ((1 << MV_MAX_BITS) - 1)); + maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + + ((1 << MV_MAX_BITS) - 1)); + + tr = br; + tc = bc; + + + offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + // central mv bestmv->as_mv.row <<= 3; bestmv->as_mv.col <<= 3; - startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = bestmse; - bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); - thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse); - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } + // TODO(yunqingwang): central pointer error was already calculated in full- + // pixel search, and can be passed in this 
function. + comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); - thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse); - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; + // Each subsequent iteration checks at least one point in + // common with the last iteration could be 2 ( if diag selected) + // 1/2 pel + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; } + tr = br; + tc = bc; - this_mv.as_mv.row += 8; - thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - *sse1 = sse; + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; } - // now check 1 more diagonal - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - this_mv = startmv; - - switch (whichdir) { - case 0: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, - z, src_stride, &sse); - break; - case 1: - this_mv.as_mv.col += 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, - z, src_stride, &sse); - break; - case 2: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse); - break; - case 3: - default: - this_mv.as_mv.col += 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse); - break; + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; } + bestmv->as_mv.row = br; + bestmv->as_mv.col = bc; - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; - return bestmse; + return besterr; } +#undef MVC +#undef PRE +#undef DIST +#undef IFMVCV +#undef CHECK_BETTER +#undef SP + #define CHECK_BOUNDS(range) \ {\ all_in = 1;\ @@ -1245,8 +764,10 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, {\ if (thissad < bestsad)\ {\ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, \ - sad_per_bit);\ + if (use_mvcost) \ + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \ + mvjsadcost, mvsadcost, \ + sad_per_bit);\ if (thissad < bestsad)\ {\ bestsad = thissad;\ @@ -1255,46 +776,53 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, }\ } -static const MV 
next_chkpts[6][3] = { - {{ -2, 0}, { -1, -2}, {1, -2}}, - {{ -1, -2}, {1, -2}, {2, 0}}, - {{1, -2}, {2, 0}, {1, 2}}, - {{2, 0}, {1, 2}, { -1, 2}}, - {{1, 2}, { -1, 2}, { -2, 0}}, - {{ -1, 2}, { -2, 0}, { -1, -2}} -}; - -int vp9_hex_search -( - MACROBLOCK *x, - int_mv *ref_mv, - int_mv *best_mv, - int search_param, - int sad_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjsadcost, int *mvsadcost[2], - int *mvjcost, int *mvcost[2], - int_mv *center_mv -) { +#define get_next_chkpts(list, i, n) \ + list[0] = ((i) == 0 ? (n) - 1 : (i) - 1); \ + list[1] = (i); \ + list[2] = ((i) == (n) - 1 ? 0 : (i) + 1); + +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +// Generic pattern search function that searches over multiple scales. +// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function. +static int vp9_pattern_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + int do_refine, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, int_mv *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES] + [MAX_PATTERN_CANDIDATES]) { const MACROBLOCKD* const xd = &x->e_mbd; - MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} }; - MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}}; - int i, j; - + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, j, s, t; uint8_t *what = x->plane[0].src.buf; int what_stride = x->plane[0].src.stride; int in_what_stride = xd->plane[0].pre[0].stride; int br, bc; int_mv this_mv; - unsigned int bestsad = 0x7fffffff; - unsigned int thissad; + int bestsad = INT_MAX; + int thissad; uint8_t *base_offset; uint8_t *this_offset; int k = -1; int all_in; int best_site = -1; - int_mv fcenter_mv; + int best_init_s = search_param_to_steps[search_param]; + int *mvjsadcost = x->nmvjointsadcost; + int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; @@ -1306,7 +834,7 @@ int vp9_hex_search // Work out the start point for the search base_offset = (uint8_t *)(xd->plane[0].pre[0].buf); - this_offset = base_offset + (br * (xd->plane[0].pre[0].stride)) + bc; + this_offset = base_offset + (br * in_what_stride) + bc; this_mv.as_mv.row = br; this_mv.as_mv.col = bc; bestsad = vfp->sdf(what, what_stride, this_offset, @@ -1314,109 +842,310 @@ + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // hex search - // j=0 - CHECK_BOUNDS(2) - - if (all_in) { - for (i = 0; i < 6; i++) { - this_mv.as_mv.row = br + hex[i].row; - this_mv.as_mv.col = bc + hex[i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + // Search all possible scales up to the search param around the center point; + // pick the scale of the point that is best as the starting scale of + // further steps around it.
+ if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + best_site = -1; + CHECK_BOUNDS((1 << t)) + if (all_in) { + for (i = 0; i < num_candidates[t]; i++) { + this_mv.as_mv.row = br + candidates[t][i].row; + this_mv.as_mv.col = bc + candidates[t][i].col; + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + this_mv.as_mv.row = br + candidates[t][i].row; + this_mv.as_mv.col = bc + candidates[t][i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } } - } else { - for (i = 0; i < 6; i++) { - this_mv.as_mv.row = br + hex[i].row; - this_mv.as_mv.col = bc + hex[i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; } } - if (best_site == -1) - goto cal_neighbors; - else { - br += hex[best_site].row; - bc += hex[best_site].col; - k = best_site; - } - - for (j = 1; j < 127; j++) { + // If the center point is still the best, just skip this and move to + // the refinement step. + if (best_init_s != -1) { + s = best_init_s; best_site = -1; - CHECK_BOUNDS(2) + do { + // No need to search all 6 points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + CHECK_BOUNDS((1 << s)) + if (all_in) { + for (i = 0; i < num_candidates[s]; i++) { + this_mv.as_mv.row = br + candidates[s][i].row; + this_mv.as_mv.col = bc + candidates[s][i].col; + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + this_mv.as_mv.row = br + candidates[s][i].row; + this_mv.as_mv.col = bc + candidates[s][i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } - if (all_in) { - for (i = 0; i < 3; i++) { - this_mv.as_mv.row = br + next_chkpts[k][i].row; - this_mv.as_mv.col = bc + next_chkpts[k][i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER - } - } else { - for (i = 0; i < 3; i++) { - this_mv.as_mv.row = br + next_chkpts[k][i].row; - this_mv.as_mv.col = bc + next_chkpts[k][i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } } - } - if (best_site == -1) - break; - else { - br += next_chkpts[k][best_site].row; - bc += next_chkpts[k][best_site].col; - k += 5 + best_site; - if (k >= 12) k -= 12; - else if 
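The hunk above is the heart of vp9_pattern_search: an optional initial sweep over all scales up to the search parameter to find the most promising radius, followed by a descent from that scale downwards (the s-- loop later in the function), where each scale is searched until no candidate improves before the radius is halved. A compact, self-contained sketch of that coarse-to-fine schedule, using a toy cost function and a 4-point diamond in place of the candidate tables:

    #include <stdio.h>

    /* Dummy SAD: distance from a hidden optimum, to make the walk visible. */
    static int cost(int r, int c) { return (r - 9) * (r - 9) + (c - 5) * (c - 5); }

    int main(void) {
      static const int dr[4] = { 0, 1, 0, -1 }, dc[4] = { 1, 0, -1, 0 };
      int br = 0, bc = 0, best = cost(br, bc);
      int s;
      for (s = 3; s >= 0; s--) {           /* radii 8, 4, 2, 1 (2^s) */
        int moved;
        do {                               /* search this scale to a minimum */
          int i, best_site = -1;
          for (i = 0; i < 4; i++) {
            int t = cost(br + (dr[i] << s), bc + (dc[i] << s));
            if (t < best) { best = t; best_site = i; }
          }
          moved = best_site >= 0;
          if (moved) { br += dr[best_site] << s; bc += dc[best_site] << s; }
        } while (moved);
      }
      printf("best mv: (%d, %d)\n", br, bc);  /* converges to (9, 5) */
      return 0;
    }

With the hidden optimum at (9, 5), the walk moves in steps of 8, then 4, 2 and 1, reaching the minimum in a handful of cost evaluations.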
(k >= 6) k -= 6; - } + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + CHECK_BOUNDS((1 << s)) + + get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); + if (all_in) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + this_mv.as_mv.row = br + + candidates[s][next_chkpts_indices[i]].row; + this_mv.as_mv.col = bc + + candidates[s][next_chkpts_indices[i]].col; + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + this_mv.as_mv.row = br + + candidates[s][next_chkpts_indices[i]].row; + this_mv.as_mv.col = bc + + candidates[s][next_chkpts_indices[i]].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } while (s--); } - // check 4 1-away neighbors -cal_neighbors: - for (j = 0; j < 32; j++) { - best_site = -1; - CHECK_BOUNDS(1) + // Check 4 1-away neighbors if do_refine is true. + // For most well-designed schemes do_refine will not be necessary. + if (do_refine) { + static const MV neighbors[4] = { + {0, -1}, { -1, 0}, {1, 0}, {0, 1}, + }; + for (j = 0; j < 16; j++) { + best_site = -1; + CHECK_BOUNDS(1) + if (all_in) { + for (i = 0; i < 4; i++) { + this_mv.as_mv.row = br + neighbors[i].row; + this_mv.as_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < 4; i++) { + this_mv.as_mv.row = br + neighbors[i].row; + this_mv.as_mv.col = bc + neighbors[i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } - if (all_in) { - for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER - } - } else { - for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + if (best_site == -1) { + break; + } else { + br += neighbors[best_site].row; + bc += neighbors[best_site].col; } } - - if (best_site == -1) - break; - else { - br += neighbors[best_site].row; - bc += neighbors[best_site].col; - } } best_mv->as_mv.row = br; best_mv->as_mv.col = bc; - return bestsad; + this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) + + best_mv->as_mv.col; + this_mv.as_mv.row = best_mv->as_mv.row << 3; + this_mv.as_mv.col = best_mv->as_mv.col << 3; + if (bestsad == INT_MAX) + return INT_MAX; + return + vfp->vf(what, what_stride, this_offset, in_what_stride, + (unsigned int *)(&bestsad)) + + use_mvcost ? 
mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) : 0; +} + + +int vp9_hex_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, int_mv *best_mv) { + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { + 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 + }; + // Note that the largest candidate step at each scale is 2^scale + static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, { 0, 1}, { -1, 1}, {-1, 0}}, + {{-1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}}, + {{-2, -4}, {2, -4}, {4, 0}, {2, 4}, { -2, 4}, { -4, 0}}, + {{-4, -8}, {4, -8}, {8, 0}, {4, 8}, { -4, 8}, { -8, 0}}, + {{-8, -16}, {8, -16}, {16, 0}, {8, 16}, { -8, 16}, { -16, 0}}, + {{-16, -32}, {16, -32}, {32, 0}, {16, 32}, { -16, 32}, { -32, 0}}, + {{-32, -64}, {32, -64}, {64, 0}, {32, 64}, { -32, 64}, { -64, 0}}, + {{-64, -128}, {64, -128}, {128, 0}, {64, 128}, { -64, 128}, { -128, 0}}, + {{-128, -256}, {128, -256}, {256, 0}, {128, 256}, { -128, 256}, { -256, 0}}, + {{-256, -512}, {256, -512}, {512, 0}, {256, 512}, { -256, 512}, { -512, 0}}, + {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024}, + { -1024, 0}}, + }; + return + vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + hex_num_candidates, hex_candidates); +} + +int vp9_bigdia_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv) { + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + static const MV bigdia_candidates[MAX_PATTERN_SCALES] + [MAX_PATTERN_CANDIDATES] = { + {{0, -1}, {1, 0}, { 0, 1}, {-1, 0}}, + {{-1, -1}, {0, -2}, {1, -1}, {2, 0}, {1, 1}, {0, 2}, {-1, 1}, {-2, 0}}, + {{-2, -2}, {0, -4}, {2, -2}, {4, 0}, {2, 2}, {0, 4}, {-2, 2}, {-4, 0}}, + {{-4, -4}, {0, -8}, {4, -4}, {8, 0}, {4, 4}, {0, 8}, {-4, 4}, {-8, 0}}, + {{-8, -8}, {0, -16}, {8, -8}, {16, 0}, {8, 8}, {0, 16}, {-8, 8}, {-16, 0}}, + {{-16, -16}, {0, -32}, {16, -16}, {32, 0}, {16, 16}, {0, 32}, + {-16, 16}, {-32, 0}}, + {{-32, -32}, {0, -64}, {32, -32}, {64, 0}, {32, 32}, {0, 64}, + {-32, 32}, {-64, 0}}, + {{-64, -64}, {0, -128}, {64, -64}, {128, 0}, {64, 64}, {0, 128}, + {-64, 64}, {-128, 0}}, + {{-128, -128}, {0, -256}, {128, -128}, {256, 0}, {128, 128}, {0, 256}, + {-128, 128}, {-256, 0}}, + {{-256, -256}, {0, -512}, {256, -256}, {512, 0}, {256, 256}, {0, 512}, + {-256, 256}, {-512, 0}}, + {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024}, + {-512, 512}, {-1024, 0}}, + }; + return + vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + bigdia_num_candidates, bigdia_candidates); } + +int vp9_square_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv) { + // All scales have 8 closest points in square shape + static const 
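One wrinkle worth noting in the return expression at the top of this hunk, "vfp->vf(...) + use_mvcost ? mv_err_cost(...) : 0": in C, binary + binds tighter than the conditional operator, so this parses as "(vf_result + use_mvcost) ? mv_err_cost(...) : 0" rather than adding a conditional MV cost to the variance. A two-line demonstration of the precedence:

    #include <stdio.h>

    int main(void) {
      int base = 100, use_cost = 0, extra = 7;
      int a = base + use_cost ? extra : 0;    /* parses as (base + use_cost) ? ... */
      int b = base + (use_cost ? extra : 0);  /* likely the intended grouping */
      printf("%d %d\n", a, b);                /* prints: 7 100 */
      return 0;
    }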
int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + static const MV square_candidates[MAX_PATTERN_SCALES] + [MAX_PATTERN_CANDIDATES] = { + {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, {0, 1}, {-1, 1}, {-1, 0}}, + {{-2, -2}, {0, -2}, {2, -2}, {2, 0}, {2, 2}, {0, 2}, {-2, 2}, {-2, 0}}, + {{-4, -4}, {0, -4}, {4, -4}, {4, 0}, {4, 4}, {0, 4}, {-4, 4}, {-4, 0}}, + {{-8, -8}, {0, -8}, {8, -8}, {8, 0}, {8, 8}, {0, 8}, {-8, 8}, {-8, 0}}, + {{-16, -16}, {0, -16}, {16, -16}, {16, 0}, {16, 16}, {0, 16}, + {-16, 16}, {-16, 0}}, + {{-32, -32}, {0, -32}, {32, -32}, {32, 0}, {32, 32}, {0, 32}, + {-32, 32}, {-32, 0}}, + {{-64, -64}, {0, -64}, {64, -64}, {64, 0}, {64, 64}, {0, 64}, + {-64, 64}, {-64, 0}}, + {{-128, -128}, {0, -128}, {128, -128}, {128, 0}, {128, 128}, {0, 128}, + {-128, 128}, {-128, 0}}, + {{-256, -256}, {0, -256}, {256, -256}, {256, 0}, {256, 256}, {0, 256}, + {-256, 256}, {-256, 0}}, + {{-512, -512}, {0, -512}, {512, -512}, {512, 0}, {512, 512}, {0, 512}, + {-512, 512}, {-512, 0}}, + {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024}, + {0, 1024}, {-1024, 1024}, {-1024, 0}}, + }; + return + vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + square_num_candidates, square_candidates); +}; + #undef CHECK_BOUNDS #undef CHECK_POINT #undef CHECK_BETTER @@ -1808,7 +1537,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, int in_what_stride = xd->plane[0].pre[0].stride; int mv_stride = xd->plane[0].pre[0].stride; uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0]; + int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; int_mv this_mv; int bestsad = INT_MAX; int r, c; @@ -1844,18 +1573,12 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border - if (col_min < x->mv_col_min) - col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) - col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) - row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) - row_max = x->mv_row_max; + // Apply further limits to prevent us looking using vectors that stretch + // beyond the UMV border + col_min = MAX(col_min, x->mv_col_min); + col_max = MIN(col_max, x->mv_col_max); + row_min = MAX(row_min, x->mv_row_min); + row_max = MIN(row_max, x->mv_row_max); for (r = row_min; r < row_max; r++) { this_mv.as_mv.row = r; @@ -1902,7 +1625,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, int in_what_stride = xd->plane[0].pre[0].stride; int mv_stride = xd->plane[0].pre[0].stride; uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0]; + int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1940,18 +1663,12 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border - if (col_min < x->mv_col_min) - col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) - col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) - row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) - row_max = x->mv_row_max; + // Apply further limits to prevent us 
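The full-search hunks in this region replace four range-limiting if statements with MAX/MIN clamps; the two forms are equivalent, since restricting [col_min, col_max] to the UMV border is just raising the lower bound and lowering the upper bound. A sketch, assuming the usual MAX/MIN macros:

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      int col_min = -40, col_max = 40;            /* raw search window */
      int mv_col_min = -32, mv_col_max = 24;      /* UMV border limits */

      /* Equivalent to: if (col_min < mv_col_min) col_min = mv_col_min; etc. */
      col_min = MAX(col_min, mv_col_min);
      col_max = MIN(col_max, mv_col_max);

      printf("clamped window: [%d, %d]\n", col_min, col_max);  /* [-32, 24] */
      return 0;
    }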
looking using vectors that stretch + // beyond the UMV border + col_min = MAX(col_min, x->mv_col_min); + col_max = MIN(col_max, x->mv_col_max); + row_min = MAX(row_min, x->mv_row_min); + row_max = MIN(row_max, x->mv_row_max); for (r = row_min; r < row_max; r++) { this_mv.as_mv.row = r; @@ -2030,7 +1747,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, int in_what_stride = xd->plane[0].pre[0].stride; int mv_stride = xd->plane[0].pre[0].stride; uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0]; + int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -2069,18 +1786,12 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border - if (col_min < x->mv_col_min) - col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) - col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) - row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) - row_max = x->mv_row_max; + // Apply further limits to prevent us looking using vectors that stretch + // beyond the UMV border + col_min = MAX(col_min, x->mv_col_min); + col_max = MIN(col_max, x->mv_col_max); + row_min = MAX(row_min, x->mv_row_min); + row_max = MIN(row_max, x->mv_row_max); for (r = row_min; r < row_max; r++) { this_mv.as_mv.row = r; @@ -2113,7 +1824,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, } } - while ((c + 2) < col_max) { + while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) { int i; fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index 097d33c..3598fa0 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -23,7 +23,7 @@ // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) -void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); +void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv); int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int weight); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); @@ -40,19 +40,61 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, int_mv *dst_mv); int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, int_mv *best_mv, - int search_param, int error_per_bit, + int_mv *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, const vp9_variance_fn_ptr_t *vf, - int *mvjsadcost, int *mvsadcost[2], - int *mvjcost, int *mvcost[2], - int_mv *center_mv); + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv); +int vp9_bigdia_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vf, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv); +int vp9_square_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vf, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv); -typedef int (fractional_mv_step_fp) (MACROBLOCK *x, int_mv - *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse); -extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively; -extern fractional_mv_step_fp vp9_find_best_sub_pixel_step; 
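The added fn_ptr->sdx3f != NULL guard in vp9_full_search_sadx8 protects against variance tables that do not provide a 3-wide SAD kernel; when the pointer is NULL the batched loop never runs and the scalar per-column path does all the work. The same guard pattern in isolation (types and names here are illustrative, not libvpx's):

    #include <stdio.h>

    typedef void (*sad_x3_fn)(const int *ref, int n, int out[3]);

    /* Batched kernel: three results per call (stand-in for sdx3f). */
    static void sad_x3(const int *ref, int n, int out[3]) {
      int k;
      for (k = 0; k < 3; k++)
        out[k] = ref[k] * n;
    }

    static void search(const int *ref, int n, sad_x3_fn x3) {
      int c = 0;
      while (c + 2 < n && x3 != NULL) {   /* fast path only if kernel exists */
        int out[3];
        x3(ref + c, n, out);
        printf("batched 3-wide check at col %d (first sad %d)\n", c, out[0]);
        c += 3;
      }
      for (; c < n; c++)                  /* scalar fallback */
        printf("scalar check at col %d\n", c);
    }

    int main(void) {
      int ref[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      search(ref, 8, sad_x3);  /* uses the batched kernel */
      search(ref, 8, NULL);    /* no kernel: everything goes scalar */
      return 0;
    }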
-extern fractional_mv_step_fp vp9_find_best_half_pixel_step; +typedef int (fractional_mv_step_fp) ( + MACROBLOCK *x, + int_mv *bestmv, + int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, // 0 - full, 1 - qtr only, 2 - half only + int iters_per_step, + int *mvjcost, + int *mvcost[2], + int *distortion, + unsigned int *sse); +extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; + +typedef int (fractional_mv_step_comp_fp) ( + MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, // 0 - full, 1 - qtr only, 2 - half only + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, + int w, int h); +extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative; +extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x, int_mv *ref_mv, int sad_per_bit, @@ -75,15 +117,6 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], int_mv *center_mv); -int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, - const uint8_t *second_pred, - int w, int h); - int vp9_refining_search_8p_c(MACROBLOCK *x, int_mv *ref_mv, int error_per_bit, int search_range, vp9_variance_fn_ptr_t *fn_ptr, diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c index 993aba7..a5dfaed 100644 --- a/libvpx/vp9/encoder/vp9_modecosts.c +++ b/libvpx/vp9/encoder/vp9_modecosts.c @@ -16,28 +16,28 @@ void vp9_init_mode_costs(VP9_COMP *c) { - VP9_COMMON *x = &c->common; + VP9_COMMON *const cm = &c->common; const vp9_tree_p KT = vp9_intra_mode_tree; int i, j; - for (i = 0; i < VP9_INTRA_MODES; i++) { - for (j = 0; j < VP9_INTRA_MODES; j++) { + for (i = 0; i < INTRA_MODES; i++) { + for (j = 0; j < INTRA_MODES; j++) { vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], KT); } } // TODO(rbultje) separate tables for superblock costing? - vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1], + vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1], vp9_intra_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[1], - x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree); + cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[0], - vp9_kf_uv_mode_prob[VP9_INTRA_MODES - 1], + vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i) + for (i = 0; i <= SWITCHABLE_FILTERS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], - x->fc.switchable_interp_prob[i], + cm->fc.switchable_interp_prob[i], vp9_switchable_interp_tree); } diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index db03995..883b31e 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
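vp9_init_mode_costs above turns the frame's mode probability trees into per-mode rate costs through vp9_cost_tokens. The cost of a decision coded with probability p out of 256 is about -log2(p/256) bits, held in fixed point; an illustrative calculation (libvpx itself reads this from a precomputed table, and the exact fixed-point scale is an internal detail):

    #include <math.h>
    #include <stdio.h>

    /* Approximate cost, in 1/256-bit units here, of a decision coded with
     * probability prob/256. Illustrative only. */
    static int approx_cost(int prob) {
      return (int)lround(-256.0 * log2(prob / 256.0));
    }

    int main(void) {
      printf("p = 128/256 -> %d (one bit)\n", approx_cost(128));   /* 256 */
      printf("p = 192/256 -> %d\n", approx_cost(192));             /* ~106 */
      printf("p =  64/256 -> %d (two bits)\n", approx_cost(64));   /* 512 */
      return 0;
    }

Likely modes (high probability) therefore get small rate costs in the tables that vp9_cost_tokens fills, and unlikely modes get large ones, which is exactly what the RD mode decision needs.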
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -28,7 +28,7 @@ #include "vp9/encoder/vp9_segmentation.h" #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vpx_mem/vpx_mem.h" @@ -49,14 +49,10 @@ extern void print_tree_update_probs(); -static void set_default_lf_deltas(VP9_COMP *cpi); +static void set_default_lf_deltas(struct loopfilter *lf); #define DEFAULT_INTERP_FILTER SWITCHABLE -#define SEARCH_BEST_FILTER 0 /* to search exhaustively for - best filter */ -#define RESET_FOREACH_FILTER 0 /* whether to reset the encoder state - before trying each new filter */ #define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ #define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv @@ -98,15 +94,11 @@ FILE *keyfile; #ifdef ENTROPY_STATS -extern int intra_mode_stats[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES]; +extern int intra_mode_stats[INTRA_MODES] + [INTRA_MODES] + [INTRA_MODES]; #endif -#ifdef NMV_STATS -extern void init_nmvstats(); -extern void print_nmvstats(); -#endif #ifdef MODE_STATS extern void init_tx_count_stats(); extern void write_tx_count_stats(); @@ -241,10 +233,9 @@ void vp9_initialize_enc() { } } -static void setup_features(VP9_COMP *cpi) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct loopfilter *const lf = &xd->lf; - struct segmentation *const seg = &xd->seg; +static void setup_features(VP9_COMMON *cm) { + struct loopfilter *const lf = &cm->lf; + struct segmentation *const seg = &cm->seg; // Set up default state for MB feature flags seg->enabled = 0; @@ -262,7 +253,7 @@ static void setup_features(VP9_COMP *cpi) { vp9_zero(lf->last_ref_deltas); vp9_zero(lf->last_mode_deltas); - set_default_lf_deltas(cpi); + set_default_lf_deltas(lf); } static void dealloc_compressor_data(VP9_COMP *cpi) { @@ -324,8 +315,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct segmentation *seg = &xd->seg; + struct segmentation *seg = &cm->seg; int high_q = (int)(cpi->avg_q > 48.0); int qi_delta; @@ -450,9 +440,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) { void vp9_update_mode_context_stats(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int i, j; - unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = + unsigned int (*inter_mode_counts)[INTER_MODES - 1][2] = cm->fc.inter_mode_counts; - int64_t (*mv_ref_stats)[VP9_INTER_MODES - 1][2] = cpi->mv_ref_stats; + int64_t (*mv_ref_stats)[INTER_MODES - 1][2] = cpi->mv_ref_stats; FILE *f; // Read the past stats counters @@ -466,7 +456,7 @@ void vp9_update_mode_context_stats(VP9_COMP *cpi) { // Add in the values for this frame for (i = 0; i < INTER_MODE_CONTEXTS; i++) { - for (j = 0; j < VP9_INTER_MODES - 1; j++) { + for (j = 0; j < INTER_MODES - 1; j++) { mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0]; mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1]; } @@ -485,12 +475,12 @@ void print_mode_context(VP9_COMP *cpi) { fprintf(f, "#include \"vp9_entropy.h\"\n"); fprintf( f, - "const int inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1] ="); + "const int inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] ="); fprintf(f, "{\n"); for (j = 0; j < INTER_MODE_CONTEXTS; j++) { fprintf(f, " {/* %d */ ", j); fprintf(f, " "); - for 
(i = 0; i < VP9_INTER_MODES - 1; i++) { + for (i = 0; i < INTER_MODES - 1; i++) { int this_prob; int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1]; if (count) @@ -533,22 +523,20 @@ static void print_seg_map(VP9_COMP *cpi) { static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int row, col; - MODE_INFO *mi, *mi_ptr = cm->mi; + MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible; uint8_t *cache_ptr = cm->last_frame_seg_map, *cache; for (row = 0; row < cm->mi_rows; row++) { - mi = mi_ptr; + mi_8x8 = mi_8x8_ptr; cache = cache_ptr; - for (col = 0; col < cm->mi_cols; col++, mi++, cache++) - cache[0] = mi->mbmi.segment_id; - mi_ptr += cm->mode_info_stride; + for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) + cache[0] = mi_8x8[0]->mbmi.segment_id; + mi_8x8_ptr += cm->mode_info_stride; cache_ptr += cm->mi_cols; } } -static void set_default_lf_deltas(VP9_COMP *cpi) { - struct loopfilter *lf = &cpi->mb.e_mbd.lf; - +static void set_default_lf_deltas(struct loopfilter *lf) { lf->mode_ref_delta_enabled = 1; lf->mode_ref_delta_update = 1; @@ -565,9 +553,8 @@ static void set_default_lf_deltas(VP9_COMP *cpi) { lf->mode_deltas[1] = 0; // New mv } -static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { +static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { SPEED_FEATURES *sf = &cpi->sf; - int speed_multiplier = speed + 1; int i; // Set baseline threshold values @@ -578,46 +565,46 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { sf->thresh_mult[THR_NEARESTG] = 0; sf->thresh_mult[THR_NEARESTA] = 0; - sf->thresh_mult[THR_NEWMV] += speed_multiplier * 1000; - sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000; - sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1000; - - sf->thresh_mult[THR_DC] += speed_multiplier * 1000; - - sf->thresh_mult[THR_NEWG] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEWA] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000; - - sf->thresh_mult[THR_TM] += speed_multiplier * 1000; - - sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEWLA] += speed_multiplier * 2000; - sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000; - sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEWGA] += speed_multiplier * 2000; - - sf->thresh_mult[THR_SPLITMV] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITG] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITA] += speed_multiplier * 2500; - sf->thresh_mult[THR_COMP_SPLITLA] += speed_multiplier * 4500; - sf->thresh_mult[THR_COMP_SPLITGA] += speed_multiplier * 4500; - - sf->thresh_mult[THR_ZEROMV] += speed_multiplier * 2000; - sf->thresh_mult[THR_ZEROG] += speed_multiplier * 2000; - sf->thresh_mult[THR_ZEROA] += speed_multiplier * 2000; - sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 2500; - sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 2500; - - sf->thresh_mult[THR_B_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_H_PRED] += speed_multiplier * 2000; - sf->thresh_mult[THR_V_PRED] += speed_multiplier * 2000; - sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 2500; - sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_D27_PRED] += 
speed_multiplier * 2500; - sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_NEWMV] += 1000; + sf->thresh_mult[THR_COMP_NEARESTLA] += 1000; + sf->thresh_mult[THR_NEARMV] += 1000; + sf->thresh_mult[THR_COMP_NEARESTGA] += 1000; + + sf->thresh_mult[THR_DC] += 1000; + + sf->thresh_mult[THR_NEWG] += 1000; + sf->thresh_mult[THR_NEWA] += 1000; + sf->thresh_mult[THR_NEARA] += 1000; + + sf->thresh_mult[THR_TM] += 1000; + + sf->thresh_mult[THR_COMP_NEARLA] += 1500; + sf->thresh_mult[THR_COMP_NEWLA] += 2000; + sf->thresh_mult[THR_NEARG] += 1000; + sf->thresh_mult[THR_COMP_NEARGA] += 1500; + sf->thresh_mult[THR_COMP_NEWGA] += 2000; + + sf->thresh_mult[THR_SPLITMV] += 2500; + sf->thresh_mult[THR_SPLITG] += 2500; + sf->thresh_mult[THR_SPLITA] += 2500; + sf->thresh_mult[THR_COMP_SPLITLA] += 4500; + sf->thresh_mult[THR_COMP_SPLITGA] += 4500; + + sf->thresh_mult[THR_ZEROMV] += 2000; + sf->thresh_mult[THR_ZEROG] += 2000; + sf->thresh_mult[THR_ZEROA] += 2000; + sf->thresh_mult[THR_COMP_ZEROLA] += 2500; + sf->thresh_mult[THR_COMP_ZEROGA] += 2500; + + sf->thresh_mult[THR_B_PRED] += 2500; + sf->thresh_mult[THR_H_PRED] += 2000; + sf->thresh_mult[THR_V_PRED] += 2000; + sf->thresh_mult[THR_D45_PRED ] += 2500; + sf->thresh_mult[THR_D135_PRED] += 2500; + sf->thresh_mult[THR_D117_PRED] += 2500; + sf->thresh_mult[THR_D153_PRED] += 2500; + sf->thresh_mult[THR_D207_PRED] += 2500; + sf->thresh_mult[THR_D63_PRED] += 2500; if (cpi->sf.skip_lots_of_modes) { for (i = 0; i < MAX_MODES; ++i) @@ -713,9 +700,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->search_method = NSTEP; sf->auto_filter = 1; sf->recode_loop = 1; - sf->quarter_pixel_search = 1; - sf->half_pixel_search = 1; - sf->iterative_sub_pixel = 1; + sf->subpel_search_method = SUBPEL_TREE; + sf->subpel_iters_per_step = 2; sf->optimize_coefficients = !cpi->oxcf.lossless; sf->reduce_first_step_size = 0; sf->auto_mv_step_size = 0; @@ -724,11 +710,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_rd_thresh = 0; sf->use_lastframe_partitioning = 0; sf->tx_size_search_method = USE_FULL_RD; - sf->use_8tap_always = 0; + sf->use_lp32x32fdct = 0; + sf->adaptive_motion_search = 0; sf->use_avoid_tested_higherror = 0; sf->reference_masking = 0; sf->skip_lots_of_modes = 0; - sf->adjust_thresholds_by_speed = 0; sf->partition_by_variance = 0; sf->use_one_partition_size_always = 0; sf->less_rectangular_check = 0; @@ -736,22 +722,23 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->auto_min_max_partition_size = 0; sf->auto_min_max_partition_interval = 0; sf->auto_min_max_partition_count = 0; - // sf->use_max_partition_size = 0; sf->max_partition_size = BLOCK_64X64; - // sf->use_min_partition_size = 0; sf->min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; sf->last_partitioning_redo_frequency = 4; sf->disable_splitmv = 0; sf->mode_search_skip_flags = 0; - sf->last_chroma_intra_mode = TM_PRED; + sf->disable_split_var_thresh = 0; + sf->disable_filter_search_var_thresh = 0; + sf->intra_y_mode_mask = ALL_INTRA_MODES; + sf->intra_uv_mode_mask = ALL_INTRA_MODES; sf->use_rd_breakout = 0; sf->skip_encode_sb = 0; sf->use_uv_intra_rd_estimate = 0; + sf->use_fast_lpf_pick = 0; + sf->use_fast_coef_updates = 0; sf->using_small_partition_info = 0; - // Skip any mode not chosen at size < X for all sizes > X - // Hence BLOCK_64X64 (skip is off) - sf->unused_mode_skip_lvl = BLOCK_64X64; + sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set #if CONFIG_MULTIPLE_ARF // Switch segmentation off. 
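The thresh_mult values set above feed early mode pruning in the RD loop: each mode's threshold is scaled by a per-mode frequency factor (initialised to 32 later in this commit) and a mode is skipped once the best RD cost found so far already beats its adjusted threshold. A schematic of that gate; the shift by 5 (so that 32 means x1) is an assumption for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_MODES 3

    int main(void) {
      int64_t best_rd = 90000;                       /* best cost so far */
      int64_t rd_thresh[MAX_MODES] = { 80000, 100000, 250000 };
      int thresh_freq_fact[MAX_MODES] = { 32, 32, 32 };  /* 32 == x1 */
      int mode;

      for (mode = 0; mode < MAX_MODES; mode++) {
        /* Prune modes whose adjusted threshold already exceeds best_rd. */
        if (best_rd < (rd_thresh[mode] * thresh_freq_fact[mode] >> 5)) {
          printf("mode %d pruned\n", mode);
          continue;
        }
        printf("mode %d evaluated\n", mode);
      }
      return 0;
    }

Higher thresh_mult biases, such as the +4500 on compound split modes, therefore translate directly into earlier pruning of expensive modes.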
@@ -762,7 +749,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { switch (mode) { case 0: // best quality mode - sf->search_best_filter = SEARCH_BEST_FILTER; break; case 1: @@ -773,9 +759,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif sf->use_avoid_tested_higherror = 1; - sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh = MIN((speed + 1), 4); + if (speed == 1) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->less_rectangular_check = 1; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || @@ -787,7 +774,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->common.show_frame == 0); sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; - sf->unused_mode_skip_lvl = BLOCK_32X32; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | @@ -795,22 +781,30 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; + sf->adaptive_motion_search = 1; sf->auto_mv_step_size = 1; sf->auto_min_max_partition_size = 1; - // sf->use_max_partition_size = 1; - // sf->use_min_partition_size = 1; sf->auto_min_max_partition_interval = 1; + // FIXME(jingning): temporarily turn off disable_split_var_thresh + // during refactoring process. will get this back after finishing + // the main framework of partition search type. + sf->disable_split_var_thresh = 0; + sf->disable_filter_search_var_thresh = 16; + + sf->intra_y_mode_mask = INTRA_DC_TM_H_V; + sf->intra_uv_mode_mask = INTRA_DC_TM_H_V; + sf->use_fast_coef_updates = 1; + sf->mode_skip_start = 9; } if (speed == 2) { - sf->adjust_thresholds_by_speed = 1; sf->less_rectangular_check = 1; sf->use_square_partition_only = 1; - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->use_lastframe_partitioning = 1; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - sf->unused_mode_skip_lvl = BLOCK_32X32; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || cpi->common.show_frame == 0) ? @@ -822,17 +816,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; - sf->last_chroma_intra_mode = DC_PRED; + sf->intra_y_mode_mask = INTRA_DC_TM; + sf->intra_uv_mode_mask = INTRA_DC_TM; sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; - sf->using_small_partition_info = 1; + sf->use_lp32x32fdct = 1; + sf->adaptive_motion_search = 1; + sf->using_small_partition_info = 0; sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 
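mode_search_skip_flags packs several independent skip heuristics into one bitmask, composed with bitwise OR and queried with bitwise AND. A minimal sketch (the flag values are invented for the sketch; only the compose-and-test pattern matters):

    #include <stdio.h>

    #define FLAG_SKIP_INTRA_DIRMISMATCH (1 << 0)
    #define FLAG_SKIP_INTRA_BESTINTER   (1 << 1)
    #define FLAG_SKIP_COMP_BESTINTRA    (1 << 2)
    #define FLAG_EARLY_TERMINATE        (1 << 3)

    int main(void) {
      unsigned flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                       FLAG_SKIP_INTRA_BESTINTER |
                       FLAG_EARLY_TERMINATE;

      if (flags & FLAG_EARLY_TERMINATE)
        printf("early termination heuristic on\n");
      if (!(flags & FLAG_SKIP_COMP_BESTINTRA))
        printf("compound-vs-intra skip heuristic off\n");
      return 0;
    }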
1 : 0; sf->auto_mv_step_size = 1; + sf->search_method = SQUARE; + sf->subpel_iters_per_step = 1; + sf->use_fast_lpf_pick = 1; + sf->auto_min_max_partition_size = 1; + sf->auto_min_max_partition_interval = 2; + sf->disable_split_var_thresh = 32; + sf->disable_filter_search_var_thresh = 32; + sf->use_fast_coef_updates = 2; + sf->mode_skip_start = 9; } if (speed == 3) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->partition_by_variance = 1; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || @@ -847,11 +853,20 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; sf->disable_splitmv = 1; sf->auto_mv_step_size = 1; + sf->search_method = BIGDIA; + sf->subpel_iters_per_step = 1; + sf->disable_split_var_thresh = 64; + sf->disable_filter_search_var_thresh = 64; + sf->intra_y_mode_mask = INTRA_DC_ONLY; + sf->intra_uv_mode_mask = INTRA_DC_ONLY; + sf->use_fast_coef_updates = 2; + sf->mode_skip_start = 9; } if (speed == 4) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->use_one_partition_size_always = 1; sf->always_this_block_size = BLOCK_16X16; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || @@ -866,37 +881,28 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; + sf->use_lp32x32fdct = 1; sf->optimize_coefficients = 0; sf->auto_mv_step_size = 1; // sf->reduce_first_step_size = 1; // sf->reference_masking = 1; sf->disable_splitmv = 1; + sf->search_method = HEX; + sf->subpel_iters_per_step = 1; + sf->disable_split_var_thresh = 64; + sf->disable_filter_search_var_thresh = 96; + sf->intra_y_mode_mask = INTRA_DC_ONLY; + sf->intra_uv_mode_mask = INTRA_DC_ONLY; + sf->use_fast_coef_updates = 2; + sf->mode_skip_start = 9; } - /* - if (speed == 2) { - sf->first_step = 0; - sf->comp_inter_joint_search_thresh = BLOCK_8X8; - sf->use_max_partition_size = 1; - sf->max_partition_size = BLOCK_16X16; - } - if (speed == 3) { - sf->first_step = 0; - sf->comp_inter_joint_search_thresh = BLOCK_B8X8; - sf->use_min_partition_size = 1; - sf->min_partition_size = BLOCK_8X8; - } - */ - break; }; /* switch */ // Set rd thresholds based on mode and speed setting - if (cpi->sf.adjust_thresholds_by_speed) - set_rd_speed_thresholds(cpi, mode, speed); - else - set_rd_speed_thresholds(cpi, mode, 0); + set_rd_speed_thresholds(cpi, mode); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. 
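Speeds 2, 3 and 4 now select different integer search patterns (SQUARE, BIGDIA and HEX respectively), all of which are thin wrappers around vp9_pattern_search with different candidate tables. A hedged sketch of how the setting maps to the wrappers; the dispatch table itself is illustrative:

    #include <stdio.h>

    typedef enum { NSTEP, HEX, BIGDIA, SQUARE } SEARCH_METHODS;

    static const char *pattern_name(SEARCH_METHODS m) {
      switch (m) {
        case HEX:    return "vp9_hex_search";     /* 6-point hexagon rings */
        case BIGDIA: return "vp9_bigdia_search";  /* 8-point diamond rings */
        case SQUARE: return "vp9_square_search";  /* 8-point square rings */
        default:     return "diamond step search";
      }
    }

    int main(void) {
      /* Mirrors the speed-feature assignments in this hunk. */
      SEARCH_METHODS speed_to_method[5] = { NSTEP, NSTEP, SQUARE, BIGDIA, HEX };
      int speed;
      for (speed = 0; speed < 5; speed++)
        printf("speed %d -> %s\n", speed, pattern_name(speed_to_method[speed]));
      return 0;
    }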
@@ -915,12 +921,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; - if (cpi->sf.iterative_sub_pixel == 1) { - cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively; - } else if (cpi->sf.quarter_pixel_search) { - cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step; - } else if (cpi->sf.half_pixel_search) { - cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step; + if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative; + cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative; + } else if (cpi->sf.subpel_search_method == SUBPEL_TREE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; + cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree; } cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1; @@ -1163,6 +1169,9 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->gld_fb_idx = 1; cpi->alt_fb_idx = 2; + cpi->current_layer = 0; + cpi->use_svc = 0; + set_tile_limits(cpi); cpi->fixed_divide[0] = 0; @@ -1227,7 +1236,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->refresh_frame_context = 1; cm->reset_frame_context = 0; - setup_features(cpi); + setup_features(cm); cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation set_mvcost(&cpi->mb); @@ -1297,7 +1306,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness); - cpi->mb.e_mbd.lf.sharpness_level = cpi->oxcf.Sharpness; + cpi->common.lf.sharpness_level = cpi->oxcf.Sharpness; if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some @@ -1382,7 +1391,7 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { - int i; + int i, j; volatile union { VP9_COMP *cpi; VP9_PTR ptr; @@ -1433,14 +1442,13 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->alt_is_last = 0; cpi->gold_is_alt = 0; + // Spatial scalability + cpi->number_spatial_layers = oxcf->ss_number_layers; + // Create the encoder segmentation map and set all entries to 0 CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - // And a copy in common for temporal coding - CHECK_MEM_ERROR(cm, cm->last_frame_seg_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - // And a place holder structure is the coding context // for use if we want to save and restore it CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, @@ -1462,9 +1470,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { init_context_counters(); #endif -#ifdef NMV_STATS - init_nmvstats(); -#endif #ifdef MODE_STATS init_tx_count_stats(); init_switchable_interp_stats(); @@ -1576,6 +1581,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->output_pkt_list = oxcf->output_pkt_list; + cpi->enable_encode_breakout = 1; + if (cpi->pass == 1) { vp9_init_first_pass(cpi); } else if (cpi->pass == 2) { @@ -1591,9 +1598,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { vp9_set_speed_features(cpi); - // Set starting values of RD threshold multipliers (128 = *1) - for (i = 0; i < MAX_MODES; i++) - cpi->rd_thresh_mult[i] = 128; + // Default rd threshold factors for mode selection + for (i = 0; i < BLOCK_SIZES; ++i) + for (j = 0; j < MAX_MODES; ++j) + cpi->rd_thresh_freq_fact[i][j] = 32; #define BFP(BT, SDF, 
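The subpel_search_method switch above picks the fractional-MV refinement strategy once and stores it as a function pointer, so the hot path never re-branches on the speed feature. A stripped-down sketch of that pattern (the real fractional_mv_step_fp signature is the long one declared in the vp9_mcomp.h hunk earlier; the one-argument version here is only for illustration):

    #include <stdio.h>

    typedef enum { SUBPEL_ITERATIVE, SUBPEL_TREE } SUBPEL_SEARCH_METHODS;
    typedef int (*fractional_mv_step_fp)(int hstep);

    /* Stand-ins for the two strategies named in the header changes. */
    static int sub_pixel_iterative(int hstep) { return hstep + 1; }
    static int sub_pixel_tree(int hstep)      { return hstep + 2; }

    int main(void) {
      SUBPEL_SEARCH_METHODS method = SUBPEL_TREE;
      fractional_mv_step_fp find_fractional_mv_step =
          (method == SUBPEL_ITERATIVE) ? sub_pixel_iterative
                                       : sub_pixel_tree;

      /* Callers go through the pointer, unaware of the chosen strategy. */
      printf("result: %d\n", find_fractional_mv_step(4));
      return 0;
    }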
SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ SDX3F, SDX8F, SDX4DF)\ @@ -1700,12 +1708,16 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { */ vp9_init_quantizer(cpi); - vp9_loop_filter_init(cm, &cpi->mb.e_mbd.lf); + vp9_loop_filter_init(cm); cpi->common.error.setjmp = 0; vp9_zero(cpi->y_uv_mode_count) +#ifdef MODE_TEST_HIT_STATS + vp9_zero(cpi->mode_test_hits) +#endif + return (VP9_PTR) cpi; } @@ -1728,10 +1740,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) { print_mode_context(cpi); } #endif -#ifdef NMV_STATS - if (cpi->pass != 1) - print_nmvstats(); -#endif + #ifdef MODE_STATS if (cpi->pass != 1) { write_tx_count_stats(); @@ -1790,6 +1799,34 @@ void vp9_remove_compressor(VP9_PTR *ptr) { #endif +#ifdef MODE_TEST_HIT_STATS + if (cpi->pass != 1) { + double norm_per_pixel_mode_tests = 0; + double norm_counts[BLOCK_SIZES]; + int i; + int sb64_per_frame; + int norm_factors[BLOCK_SIZES] = + {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1}; + FILE *f = fopen("mode_hit_stats.stt", "a"); + + // On average, how many mode tests do we do + for (i = 0; i < BLOCK_SIZES; ++i) { + norm_counts[i] = (double)cpi->mode_test_hits[i] / + (double)norm_factors[i]; + norm_per_pixel_mode_tests += norm_counts[i]; + } + // Convert to a number per 64x64 and per frame + sb64_per_frame = ((cpi->common.height + 63) / 64) * + ((cpi->common.width + 63) / 64); + norm_per_pixel_mode_tests = + norm_per_pixel_mode_tests / + (double)(cpi->common.current_video_frame * sb64_per_frame); + + fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests); + fclose(f); + } +#endif + #ifdef ENTROPY_STATS { int i, j, k; @@ -1797,18 +1834,18 @@ void vp9_remove_compressor(VP9_PTR *ptr) { fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n"); fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts "); - fprintf(fmode, "[VP9_INTRA_MODES][VP9_INTRA_MODES]" - "[VP9_INTRA_MODES] =\n{\n"); + fprintf(fmode, "[INTRA_MODES][INTRA_MODES]" + "[INTRA_MODES] =\n{\n"); - for (i = 0; i < VP9_INTRA_MODES; i++) { + for (i = 0; i < INTRA_MODES; i++) { fprintf(fmode, " { // Above Mode : %d\n", i); - for (j = 0; j < VP9_INTRA_MODES; j++) { + for (j = 0; j < INTRA_MODES; j++) { fprintf(fmode, " {"); - for (k = 0; k < VP9_INTRA_MODES; k++) { + for (k = 0; k < INTRA_MODES; k++) { if (!intra_mode_stats[i][j][k]) fprintf(fmode, " %5d, ", 1); else @@ -2214,6 +2251,12 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) { cpi->source_alt_ref_pending = 1; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + + // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a + // large GF_interval + if (cpi->use_svc) { + cpi->frames_till_gf_update_due = INT_MAX; + } } if (!cpi->source_alt_ref_pending) @@ -2379,7 +2422,8 @@ static void update_reference_frames(VP9_COMP * const cpi) { else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { #else - else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && + !cpi->use_svc) { #endif /* Preserve the previously existing golden frame and update the frame in * the alt ref slot instead. 
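The MODE_TEST_HIT_STATS block above normalises mode-test counts to tests per 64x64 superblock per frame: each per-block-size count is divided by how many blocks of that size fit into a 64x64 (the norm_factors table), the results are summed, and the sum is divided by frames times superblocks per frame. For 1280x720 that is ((720 + 63) / 64) * ((1280 + 63) / 64) = 12 * 20 = 240 superblocks. The same computation standalone, with invented counts:

    #include <stdio.h>

    #define BLOCK_SIZES 13

    int main(void) {
      /* Blocks of each size per 64x64 superblock, as in the diff. */
      static const int norm_factors[BLOCK_SIZES] =
          { 256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1 };
      long mode_test_hits[BLOCK_SIZES] = { 0 };  /* invented sample data */
      double per_sb64 = 0.0;
      int i, frames = 100;
      int width = 1280, height = 720;
      int sb64_per_frame = ((height + 63) / 64) * ((width + 63) / 64);

      mode_test_hits[0] = 6144000;               /* e.g. 4x4 tests */
      mode_test_hits[BLOCK_SIZES - 1] = 24000;   /* e.g. 64x64 tests */

      for (i = 0; i < BLOCK_SIZES; i++)
        per_sb64 += (double)mode_test_hits[i] / norm_factors[i];
      per_sb64 /= (double)frames * sb64_per_frame;

      printf("sb64/frame = %d, avg mode tests per sb64 = %.4f\n",
             sb64_per_frame, per_sb64);          /* 240, 2.0000 */
      return 0;
    }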
This is highly specific to the current use of @@ -2424,7 +2468,7 @@ static void update_reference_frames(VP9_COMP * const cpi) { static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct loopfilter *lf = &xd->lf; + struct loopfilter *lf = &cm->lf; if (xd->lossless) { lf->filter_level = 0; } else { @@ -2434,7 +2478,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_usec_timer_start(&timer); - vp9_pick_filter_level(cpi->Source, cpi); + vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick); vpx_usec_timer_mark(&timer); cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); @@ -2442,7 +2486,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { if (lf->filter_level > 0) { vp9_set_alt_lf_level(cpi, lf->filter_level); - vp9_loop_filter_frame(cm, xd, lf->filter_level, 0); + vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show, @@ -2452,9 +2496,11 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { static void scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int i; + int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx, + cpi->alt_fb_idx}; for (i = 0; i < 3; i++) { - YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]]; + YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[refs[i]]]; if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { @@ -2467,8 +2513,8 @@ static void scale_references(VP9_COMP *cpi) { scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]); cpi->scaled_ref_idx[i] = new_fb; } else { - cpi->scaled_ref_idx[i] = cm->ref_frame_map[i]; - cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++; + cpi->scaled_ref_idx[i] = cm->ref_frame_map[refs[i]]; + cm->fb_idx_ref_cnt[cm->ref_frame_map[refs[i]]]++; } } } @@ -2532,25 +2578,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, SPEED_FEATURES *sf = &cpi->sf; unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); - struct segmentation *seg = &xd->seg; -#if RESET_FOREACH_FILTER - int q_low0; - int q_high0; - int Q0; - int active_best_quality0; - int active_worst_quality0; - double rate_correction_factor0; - double gf_rate_correction_factor0; -#endif - - /* list of filters to search over */ - int mcomp_filters_to_search[] = { - EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE - }; - int mcomp_filters = sizeof(mcomp_filters_to_search) / - sizeof(*mcomp_filters_to_search); - int mcomp_filter_index = 0; - int64_t mcomp_filter_cost[4]; + struct segmentation *seg = &cm->seg; /* Scale the source buffer, if required */ if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || @@ -2603,7 +2631,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set default state for segment based loop filter update flags - xd->lf.mode_ref_delta_update = 0; + cm->lf.mode_ref_delta_update = 0; // Initialize cpi->mv_step_param to default based on max resolution cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); @@ -2626,10 +2654,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set various flags etc to special state if it is a key frame if (cm->frame_type == KEY_FRAME) { - int i; - // Reset the loop filter deltas and segmentation map - setup_features(cpi); + setup_features(cm); // If segmentation is enabled force a map update for key frames if (seg->enabled) { @@ -2640,10 +2666,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // The alternate reference frame cannot be active for a key frame 
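The scale_references change above is a real fix rather than a rename: the old code indexed ref_frame_map with the loop counter, but the last, golden and alt references live at cpi->lst_fb_idx, cpi->gld_fb_idx and cpi->alt_fb_idx, which need not be 0, 1 and 2. The lookup is a two-step indirection, sketched with invented buffer numbers:

    #include <stdio.h>

    int main(void) {
      /* Reference slots -> frame-buffer indices (values invented). */
      int ref_frame_map[4] = { 3, 0, 7, 1 };
      int lst_fb_idx = 0, gld_fb_idx = 1, alt_fb_idx = 2;
      int refs[3], i;

      refs[0] = lst_fb_idx;
      refs[1] = gld_fb_idx;
      refs[2] = alt_fb_idx;

      /* Two-level lookup, as in the fixed scale_references():
       * reference type -> map slot -> physical frame buffer. */
      for (i = 0; i < 3; i++)
        printf("ref %d -> slot %d -> frame buffer %d\n",
               i, refs[i], ref_frame_map[refs[i]]);
      return 0;
    }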
cpi->source_alt_ref_active = 0; - // Reset the RD threshold multipliers to default of * 1 (128) - for (i = 0; i < MAX_MODES; i++) - cpi->rd_thresh_mult[i] = 128; - cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); cm->frame_parallel_decoding_mode = (cpi->oxcf.frame_parallel_decoding_mode != 0); @@ -2672,9 +2694,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME) { #if !CONFIG_MULTIPLE_ARF - // Special case for key frames forced because we have reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping + // Special case for key frames forced because we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping if (cpi->this_key_frame_forced) { int delta_qindex; int qindex = cpi->last_boosted_qindex; @@ -2683,7 +2705,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, delta_qindex = compute_qdelta(cpi, last_boosted_q, (last_boosted_q * 0.75)); - cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality); + cpi->active_best_quality = MAX(qindex + delta_qindex, + cpi->best_quality); } else { int high = 5000; int low = 400; @@ -2704,7 +2727,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->active_best_quality = kf_low_motion_minq[q] + adjustment; } - // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { q_adj_factor -= 0.25; @@ -2713,14 +2735,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Make a further adjustment based on the kf zero motion measure. q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct); - // Convert the adjustment factor to a qindex delta on active_best_quality. + // Convert the adjustment factor to a qindex delta + // on active_best_quality. q_val = vp9_convert_qindex_to_q(cpi->active_best_quality); cpi->active_best_quality += - compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); + compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); } #else double current_q; - // Force the KF quantizer to be 30% of the active_worst_quality. current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); cpi->active_best_quality = cpi->active_worst_quality @@ -2737,13 +2759,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_frame_qindex < cpi->active_worst_quality) { q = cpi->avg_frame_qindex; } - // For constrained quality dont allow Q less than the cq level if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < cpi->cq_target_quality) { q = cpi->cq_target_quality; } - if (cpi->gfu_boost > high) { cpi->active_best_quality = gf_low_motion_minq[q]; } else if (cpi->gfu_boost < low) { @@ -2760,28 +2780,54 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Constrained quality use slightly lower active best. 
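For key frames forced by the maximum interval, the code above anchors quality to the last boosted Q: compute_qdelta converts the difference between that Q and 75% of it back into a qindex delta, which is then floored at best_quality. A toy version of the arithmetic; the linear q-to-qindex mapping below is only a stand-in for the real, nonlinear vp9_convert_qindex_to_q:

    #include <stdio.h>

    int main(void) {
      /* Linear q <-> qindex mapping as a stand-in for
       * vp9_convert_qindex_to_q / compute_qdelta. */
      double q_per_index = 0.5;
      int qindex = 80;                                  /* last_boosted_qindex */
      double last_boosted_q = qindex * q_per_index;     /* 40.0 */
      double target_q = last_boosted_q * 0.75;          /* 30.0 */
      int delta_qindex = (int)((target_q - last_boosted_q) / q_per_index);
      int best_quality = 0;
      int active_best = qindex + delta_qindex;          /* 80 - 20 = 60 */

      if (active_best < best_quality)
        active_best = best_quality;                     /* MAX(...) in the diff */
      printf("qindex %d -> active_best_quality %d\n", qindex, active_best);
      return 0;
    }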
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) cpi->active_best_quality = cpi->active_best_quality * 15 / 16; + + // TODO(debargha): Refine the logic below + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + if (!cpi->refresh_alt_ref_frame) { + cpi->active_best_quality = cpi->cq_target_quality; + } else { + if (cpi->frames_since_key > 1) { + if (cpi->gfu_boost > high) { + cpi->active_best_quality = cpi->cq_target_quality * 6 / 16; + } else if (cpi->gfu_boost < low) { + cpi->active_best_quality = cpi->cq_target_quality * 11 / 16; + } else { + const int gap = high - low; + const int offset = high - cpi->gfu_boost; + const int qdiff = cpi->cq_target_quality * 5 / 16; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + cpi->active_best_quality = cpi->cq_target_quality * 6 / 16 + + adjustment; + } + } + } + } } else { + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + cpi->active_best_quality = cpi->cq_target_quality; + } else { #ifdef ONE_SHOT_Q_ESTIMATE #ifdef STRICT_ONE_SHOT_Q - cpi->active_best_quality = q; + cpi->active_best_quality = q; #else - cpi->active_best_quality = inter_minq[q]; + cpi->active_best_quality = inter_minq[q]; #endif #else - cpi->active_best_quality = inter_minq[q]; + cpi->active_best_quality = inter_minq[q]; #endif - // For the constant/constrained quality mode we don't want - // q to fall below the cq level. - if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && - (cpi->active_best_quality < cpi->cq_target_quality)) { - // If we are strongly undershooting the target rate in the last - // frames then use the user passed in cq value not the auto - // cq value. - if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth) - cpi->active_best_quality = cpi->oxcf.cq_level; - else - cpi->active_best_quality = cpi->cq_target_quality; + // For the constant/constrained quality mode we don't want + // q to fall below the cq level. + if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && + (cpi->active_best_quality < cpi->cq_target_quality)) { + // If we are strongly undershooting the target rate in the last + // frames then use the user passed in cq value not the auto + // cq value. + if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth) + cpi->active_best_quality = cpi->oxcf.cq_level; + else + cpi->active_best_quality = cpi->cq_target_quality; + } } } @@ -2799,7 +2845,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->active_worst_quality = cpi->active_best_quality; // Special case code to try and match quality with forced key frames - if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + q = cpi->active_best_quality; + } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { q = cpi->last_boosted_qindex; } else { // Determine initial Q to try @@ -2811,7 +2859,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_MULTIPLE_ARF // Force the quantizer determined by the coding order pattern. 
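In the new constant-quality branch above, active_best_quality interpolates between 6/16 and 11/16 of cq_target_quality as gfu_boost falls from the high threshold to the low one; ((offset * qdiff) + (gap >> 1)) / gap is integer division rounded to nearest. Worked numbers (the thresholds and boost below are invented; only the arithmetic is the point):

    #include <stdio.h>

    int main(void) {
      int low = 400, high = 2000;           /* invented boost thresholds */
      int cq_target_quality = 160;
      int gfu_boost = 1200;                 /* halfway between low and high */

      int gap = high - low;                          /* 1600 */
      int offset = high - gfu_boost;                 /* 800  */
      int qdiff = cq_target_quality * 5 / 16;        /* 50   */
      /* Rounded-to-nearest division: (offset*qdiff + gap/2) / gap. */
      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;  /* 25 */
      int active_best = cq_target_quality * 6 / 16 + adjustment;

      /* Halfway boost lands halfway between 6/16 and 11/16: 85. */
      printf("adjustment = %d, active_best = %d\n", adjustment, active_best);
      return 0;
    }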
- if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME)) { + if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && + cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { double new_q; double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); int level = cpi->this_frame_weight; @@ -2841,19 +2890,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_zero(cpi->rd_tx_select_threshes); if (cm->frame_type != KEY_FRAME) { - /* TODO: Decide this more intelligently */ - if (sf->search_best_filter) { - cm->mcomp_filter_type = mcomp_filters_to_search[0]; - mcomp_filter_index = 0; - } else { - cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; - } + cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; /* TODO: Decide this more intelligently */ xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH; set_mvcost(&cpi->mb); } -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; @@ -2886,17 +2929,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_frame(cpi->Source); #endif -#if RESET_FOREACH_FILTER - if (sf->search_best_filter) { - q_low0 = q_low; - q_high0 = q_high; - Q0 = Q; - rate_correction_factor0 = cpi->rate_correction_factor; - gf_rate_correction_factor0 = cpi->gf_rate_correction_factor; - active_best_quality0 = cpi->active_best_quality; - active_worst_quality0 = cpi->active_worst_quality; - } -#endif do { vp9_clear_system_state(); // __asm emms; @@ -2946,178 +2978,135 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, active_worst_qchanged = 0; // Special case handling for forced key frames - if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { - int last_q = q; - int kf_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); - - int high_err_target = cpi->ambient_err; - int low_err_target = cpi->ambient_err >> 1; - - // Prevent possible divide by zero error below for perfect KF - kf_err += !kf_err; - - // The key frame is not good enough or we can afford - // to make it better without undue risk of popping. - if ((kf_err > high_err_target && - cpi->projected_frame_size <= frame_over_shoot_limit) || - (kf_err > low_err_target && - cpi->projected_frame_size <= frame_under_shoot_limit)) { - // Lower q_high - q_high = q > q_low ? q - 1 : q_low; - - // Adjust Q - q = (q * high_err_target) / kf_err; - q = MIN(q, (q_high + q_low) >> 1); - } else if (kf_err < low_err_target && - cpi->projected_frame_size >= frame_under_shoot_limit) { - // The key frame is much better than the previous frame - // Raise q_low - q_low = q < q_high ? q + 1 : q_high; - - // Adjust Q - q = (q * low_err_target) / kf_err; - q = MIN(q, (q_high + q_low + 1) >> 1); - } - - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } - - // Is the projected frame size out of range and are we allowed to attempt to recode. 
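The forced-key-frame path of the recode loop rescales q in proportion to reconstruction error: a frame that is not good enough, but has headroom under the overshoot limit, tightens q_high and lowers q toward q * high_err_target / kf_err, capped at the bracket midpoint; a frame that is better than needed does the mirror image with low_err_target. One iteration of the "not good enough" case with concrete numbers:

    #include <stdio.h>

    int main(void) {
      int q = 40, q_low = 20, q_high = 60;
      int ambient_err = 12000;
      int kf_err = 20000;                       /* frame worse than target */
      int high_err_target = ambient_err;        /* 12000 */

      /* Key frame not good enough: lower q proportionally, then cap. */
      q_high = q > q_low ? q - 1 : q_low;       /* 39 */
      q = q * high_err_target / kf_err;         /* 40 * 12000 / 20000 = 24 */
      if (q > (q_high + q_low) / 2)
        q = (q_high + q_low) / 2;               /* MIN with bracket midpoint */
      if (q < q_low)
        q = q_low;                              /* clamp(q, q_low, q_high) */
      printf("new q = %d (bracket [%d, %d])\n", q, q_low, q_high);  /* 24 */
      return 0;
    }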
- else if (recode_loop_test(cpi, - frame_over_shoot_limit, frame_under_shoot_limit, - q, top_index, bottom_index)) { - int last_q = q; - int retries = 0; + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + loop = 0; + } else { + if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { + int last_q = q; + int kf_err = vp9_calc_ss_err(cpi->Source, + &cm->yv12_fb[cm->new_fb_idx]); + + int high_err_target = cpi->ambient_err; + int low_err_target = cpi->ambient_err >> 1; + + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + cpi->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + cpi->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + q_high = q > q_low ? q - 1 : q_low; + + // Adjust Q + q = (q * high_err_target) / kf_err; + q = MIN(q, (q_high + q_low) >> 1); + } else if (kf_err < low_err_target && + cpi->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + q_low = q < q_high ? q + 1 : q_high; + + // Adjust Q + q = (q * low_err_target) / kf_err; + q = MIN(q, (q_high + q_low + 1) >> 1); + } - // Frame size out of permitted range: - // Update correction factor & compute new Q to try... + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = q != last_q; + } else if (recode_loop_test( + cpi, frame_over_shoot_limit, frame_under_shoot_limit, + q, top_index, bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + int last_q = q; + int retries = 0; + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + + // Frame is too large + if (cpi->projected_frame_size > cpi->this_frame_target) { + // Raise Qlow as to at least the current value + q_low = q < q_high ? q + 1 : q_high; + + if (undershoot_seen || loop_count > 1) { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 1); + + q = (q_high + q_low + 1) / 2; + } else { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 0); - // Frame is too large - if (cpi->projected_frame_size > cpi->this_frame_target) { - // Raise Qlow as to at least the current value - q_low = q < q_high ? q + 1 : q_high; + q = vp9_regulate_q(cpi, cpi->this_frame_target); - if (undershoot_seen || loop_count > 1) { - // Update rate_correction_factor unless cpi->active_worst_quality - // has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 1); + while (q < q_low && retries < 10) { + vp9_update_rate_correction_factors(cpi, 0); + q = vp9_regulate_q(cpi, cpi->this_frame_target); + retries++; + } + } - q = (q_high + q_low + 1) / 2; + overshoot_seen = 1; } else { - // Update rate_correction_factor unless cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 0); - - q = vp9_regulate_q(cpi, cpi->this_frame_target); + // Frame is too small + q_high = q > q_low ? q - 1 : q_low; + + if (overshoot_seen || loop_count > 1) { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. 
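The general recode path keeps a [q_low, q_high] bracket: an oversized frame raises q_low to at least q + 1, then either bisects (once an undershoot has been seen, so the bracket is known to contain the answer) or asks the rate controller for a fresh estimate and nudges it into range with up to 10 retries. A skeleton of the overshoot branch, with vp9_regulate_q replaced by a stub:

    #include <stdio.h>

    /* Stub for vp9_regulate_q: the rate controller's q estimate. */
    static int regulate_q(int target) { (void)target; return 28; }

    int main(void) {
      int q = 30, q_low = 10, q_high = 63;
      int undershoot_seen = 0, loop_count = 0, target = 40000;

      /* Frame too large: raise the lower bound to at least the current q. */
      q_low = q < q_high ? q + 1 : q_high;      /* 31 */

      if (undershoot_seen || loop_count > 1) {
        q = (q_high + q_low + 1) / 2;           /* bisect the bracket */
      } else {
        int retries = 0;
        q = regulate_q(target);
        while (q < q_low && retries < 10) {     /* pull estimate into bracket */
          q = regulate_q(target);
          retries++;
        }
        if (q < q_low)
          q = q_low;                            /* final clamp */
      }
      printf("retry with q = %d in [%d, %d]\n", q, q_low, q_high);  /* 31 */
      return 0;
    }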
+ if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 1); + + q = (q_high + q_low) / 2; + } else { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 0); - while (q < q_low && retries < 10) { - vp9_update_rate_correction_factors(cpi, 0); q = vp9_regulate_q(cpi, cpi->this_frame_target); - retries++; - } - } - overshoot_seen = 1; - } else { - // Frame is too small - q_high = q > q_low ? q - 1 : q_low; - - if (overshoot_seen || loop_count > 1) { - // Update rate_correction_factor unless cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 1); + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passed in value. + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) { + q_low = q; + } - q = (q_high + q_low) / 2; - } else { - // Update rate_correction_factor unless cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 0); - - q = vp9_regulate_q(cpi, cpi->this_frame_target); - - // Special case reset for qlow for constrained quality. - // This should only trigger where there is very substantial - // undershoot on a frame and the auto cq level is above - // the user passed in value. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) { - q_low = q; + while (q > q_high && retries < 10) { + vp9_update_rate_correction_factors(cpi, 0); + q = vp9_regulate_q(cpi, cpi->this_frame_target); + retries++; + } } - while (q > q_high && retries < 10) { - vp9_update_rate_correction_factors(cpi, 0); - q = vp9_regulate_q(cpi, cpi->this_frame_target); - retries++; - } + undershoot_seen = 1; } - undershoot_seen = 1; - } + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } else { - loop = 0; + loop = q != last_q; + } else { + loop = 0; + } } if (cpi->is_src_frame_alt_ref) loop = 0; - if (!loop && cm->frame_type != KEY_FRAME && sf->search_best_filter) { - if (mcomp_filter_index < mcomp_filters) { - int64_t err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); - int64_t rate = cpi->projected_frame_size << 8; - mcomp_filter_cost[mcomp_filter_index] = - (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err)); - mcomp_filter_index++; - if (mcomp_filter_index < mcomp_filters) { - cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index]; - loop_count = -1; - loop = 1; - } else { - int f; - int64_t best_cost = mcomp_filter_cost[0]; - int mcomp_best_filter = mcomp_filters_to_search[0]; - for (f = 1; f < mcomp_filters; f++) { - if (mcomp_filter_cost[f] < best_cost) { - mcomp_best_filter = mcomp_filters_to_search[f]; - best_cost = mcomp_filter_cost[f]; - } - } - if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) { - loop_count = -1; - loop = 1; - cm->mcomp_filter_type = mcomp_best_filter; - } - /* - printf(" best filter = %d, ( ", mcomp_best_filter); - for (f=0;f<mcomp_filters; f++) printf("%d ", mcomp_filter_cost[f]); - printf(")\n"); - */ - } -#if RESET_FOREACH_FILTER - if (loop) { - overshoot_seen = 0; - undershoot_seen = 0; - q_low = q_low0; - q_high = q_high0; - q = Q0; - cpi->rate_correction_factor = rate_correction_factor0; - cpi->gf_rate_correction_factor =
gf_rate_correction_factor0; - cpi->active_best_quality = active_best_quality0; - cpi->active_worst_quality = active_worst_quality0; - } -#endif - } - } - if (loop) { loop_count++; @@ -3165,7 +3154,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->dummy_packing = 0; vp9_pack_bitstream(cpi, dest, size); - if (xd->seg.update_map) + if (cm->seg.update_map) update_reference_segmentation_map(cpi); release_scaled_references(cpi); @@ -3296,9 +3285,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // in this frame. // update_base_skip_probs(cpi); -#if 0 && CONFIG_INTERNAL_STATS +#if CONFIG_INTERNAL_STATS { - FILE *f = fopen("tmp.stt", "a"); + FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); int recon_err; vp9_clear_system_state(); // __asm emms; @@ -3307,7 +3296,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, &cm->yv12_fb[cm->new_fb_idx]); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" "%6d %6d %5d %5d %5d %8.2f %10d %10.3f" "%10.3f %8d %10d %10d %10d\n", @@ -3317,6 +3306,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (int)cpi->total_target_vs_actual, (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, + cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, vp9_convert_qindex_to_q(cpi->active_best_quality), @@ -3335,7 +3325,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct); else - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" "%5d %5d %5d %8d %8d %8.2f %10d %10.3f" "%8d %10d %10d %10d\n", @@ -3346,6 +3336,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (int)cpi->total_target_vs_actual, (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, + cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, vp9_convert_qindex_to_q(cpi->active_best_quality), @@ -3473,9 +3464,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas. 
- xd->seg.update_map = 0; - xd->seg.update_data = 0; - xd->lf.mode_ref_delta_update = 0; + cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; // keep track of the last coded dimensions cm->last_width = cm->width; @@ -3486,11 +3477,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cm->show_frame) { // current mip will be the prev_mip for the next frame MODE_INFO *temp = cm->prev_mip; + MODE_INFO **temp2 = cm->prev_mi_grid_base; cm->prev_mip = cm->mip; cm->mip = temp; + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp2; // update the upper left visible macroblock ptrs cm->mi = cm->mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; // Don't increment frame counters if this was an altref buffer // update not a real frame @@ -3499,8 +3494,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // restore prev_mi cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; -#if 0 + #if 0 { char filename[512]; FILE *recon_file; @@ -3521,6 +3517,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) { + cpi->enable_encode_breakout = 1; + if (!cpi->refresh_alt_ref_frame) vp9_second_pass(cpi); @@ -3544,24 +3542,28 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, } } - -int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time) { - VP9_COMP *cpi = (VP9_COMP *) ptr; +static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) { VP9_COMMON *cm = &cpi->common; - struct vpx_usec_timer timer; - int res = 0; - if (!cpi->initial_width) { // TODO(jkoleszar): Support 1/4 subsampling? - cm->subsampling_x = sd->uv_width < sd->y_width; - cm->subsampling_y = sd->uv_height < sd->y_height; + cm->subsampling_x = (sd != NULL) && sd->uv_width < sd->y_width; + cm->subsampling_y = (sd != NULL) && sd->uv_height < sd->y_height; alloc_raw_frame_buffers(cpi); cpi->initial_width = cm->width; cpi->initial_height = cm->height; } +} + + +int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + VP9_COMP *cpi = (VP9_COMP *) ptr; + struct vpx_usec_timer timer; + int res = 0; + + check_initial_width(cpi, sd); vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? 
cpi->active_map : NULL)) @@ -3575,16 +3577,15 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, static int frame_is_reference(const VP9_COMP *cpi) { const VP9_COMMON *cm = &cpi->common; - const MACROBLOCKD *mb = &cpi->mb.e_mbd; return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || cm->refresh_frame_context || - mb->lf.mode_ref_delta_update || - mb->seg.update_map || - mb->seg.update_data; + cm->lf.mode_ref_delta_update || + cm->seg.update_map || + cm->seg.update_data; } #if CONFIG_MULTIPLE_ARF @@ -3644,6 +3645,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf, cpi->gfu_boost); vp9_temporal_filter_prepare(cpi, frames_to_arf); + vp9_extend_frame_borders(&cpi->alt_ref_buffer, + cm->subsampling_x, cm->subsampling_y); force_src_buffer = &cpi->alt_ref_buffer; } @@ -3911,9 +3914,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, { double frame_psnr2, frame_ssim2 = 0; double weight = 0; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, - cpi->mb.e_mbd.lf.filter_level * 10 / 6); + cm->lf.filter_level * 10 / 6); #endif vp9_clear_system_state(); @@ -3987,8 +3990,8 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, return -1; else { int ret; -#if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, &cpi->mb.e_mbd.lf, dest, flags); +#if CONFIG_VP9_POSTPROC + ret = vp9_post_proc_frame(&cpi->common, dest, flags); #else if (cpi->common.frame_to_show) { @@ -4001,7 +4004,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, ret = -1; } -#endif // !CONFIG_POSTPROC +#endif // !CONFIG_VP9_POSTPROC vp9_clear_system_state(); return ret; } @@ -4013,7 +4016,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int threshold[MAX_SEGMENTS]) { VP9_COMP *cpi = (VP9_COMP *) comp; signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; - struct segmentation *seg = &cpi->mb.e_mbd.seg; + struct segmentation *seg = &cpi->common.seg; int i; if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) @@ -4030,7 +4033,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, // Activate segmentation. 
vp9_enable_segmentation((VP9_PTR)cpi); - // Set up the quan, LF and breakout threshold segment data + // Set up the quant, LF and breakout threshold segment data for (i = 0; i < MAX_SEGMENTS; i++) { feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; @@ -4050,7 +4053,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); } - // Initialise the feature data structure + // Initialize the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); @@ -4098,7 +4101,76 @@ int vp9_set_internal_size(VP9_PTR comp, return 0; } +int vp9_set_size_literal(VP9_PTR comp, unsigned int width, + unsigned int height) { + VP9_COMP *cpi = (VP9_COMP *)comp; + VP9_COMMON *cm = &cpi->common; + + check_initial_width(cpi, NULL); + + if (width) { + cm->width = width; + if (cm->width * 5 < cpi->initial_width) { + cm->width = cpi->initial_width / 5 + 1; + printf("Warning: Desired width too small, changed to %d \n", cm->width); + } + if (cm->width > cpi->initial_width) { + cm->width = cpi->initial_width; + printf("Warning: Desired width too large, changed to %d \n", cm->width); + } + } + + if (height) { + cm->height = height; + if (cm->height * 5 < cpi->initial_height) { + cm->height = cpi->initial_height / 5 + 1; + printf("Warning: Desired height too small, changed to %d \n", cm->height); + } + if (cm->height > cpi->initial_height) { + cm->height = cpi->initial_height; + printf("Warning: Desired height too large, changed to %d \n", cm->height); + } + } + + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + update_frame_size(cpi); + return 0; +} + +int vp9_switch_layer(VP9_PTR comp, int layer) { + VP9_COMP *cpi = (VP9_COMP *)comp; + + if (cpi->use_svc) { + cpi->current_layer = layer; + + // Use buffer i for layer i LST + cpi->lst_fb_idx = layer; + // Use buffer i-1 for layer i Alt (Inter-layer prediction) + if (layer != 0) cpi->alt_fb_idx = layer - 1; + + // Use the rest for Golden + if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES) + cpi->gld_fb_idx = cpi->lst_fb_idx; + else + cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer; + + printf("Switching to layer %d:\n", layer); + printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->alt_fb_idx); + } else { + printf("Switching layer not supported. Enable SVC first \n"); + } + return 0; +} + +void vp9_set_svc(VP9_PTR comp, int use_svc) { + VP9_COMP *cpi = (VP9_COMP *)comp; + cpi->use_svc = use_svc; + if (cpi->use_svc) printf("Enabled SVC encoder \n"); + return; +} int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { int i, j; diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index c258829..3e5796f 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -36,6 +36,8 @@ #define DISABLE_RC_LONG_TERM_MEM 0 #endif +// #define MODE_TEST_HIT_STATS + // #define SPEEDSTATS 1 #if CONFIG_MULTIPLE_ARF // Set MIN_GF_INTERVAL to 1 for the full decomposition. 
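For reference, the new vp9_set_size_literal added to vp9_onyx.c above bounds each requested dimension to (initial/5, initial]: the encoder refuses to scale below one fifth of the initial size or above the initial size itself. A minimal standalone sketch of just that rule, using nothing beyond the hunk (clamp_scaled_dim is an illustrative name, not a libvpx symbol):

#include <stdio.h>

/* Mirrors the per-dimension bound vp9_set_size_literal applies. */
static unsigned int clamp_scaled_dim(unsigned int requested,
                                     unsigned int initial) {
  unsigned int dim = requested;
  if (dim * 5 < initial)  /* too small: raise to the smallest legal value */
    dim = initial / 5 + 1;
  if (dim > initial)      /* too large: never exceed the initial size */
    dim = initial;
  return dim;
}

int main(void) {
  /* e.g. a 1280-wide source: out-of-range requests are pulled back in */
  printf("%u\n", clamp_scaled_dim(100, 1280));   /* prints 257 */
  printf("%u\n", clamp_scaled_dim(1920, 1280));  /* prints 1280 */
  return 0;
}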
@@ -79,15 +81,15 @@ typedef struct { vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob y_mode_prob[4][VP9_INTRA_MODES - 1]; - vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; + vp9_prob y_mode_prob[4][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS - 1]; - int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; + int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; struct tx_probs tx_probs; vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; @@ -145,18 +147,19 @@ typedef struct { // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { THR_NEARESTMV, - THR_DC, - THR_NEARESTA, THR_NEARESTG, - THR_NEWMV, - THR_COMP_NEARESTLA, - THR_NEARMV, - THR_COMP_NEARESTGA, - THR_NEWG, + THR_DC, + + THR_NEWMV, THR_NEWA, + THR_NEWG, + + THR_NEARMV, THR_NEARA, + THR_COMP_NEARESTLA, + THR_COMP_NEARESTGA, THR_TM, @@ -182,7 +185,7 @@ typedef enum { THR_H_PRED, THR_V_PRED, THR_D135_PRED, - THR_D27_PRED, + THR_D207_PRED, THR_D153_PRED, THR_D63_PRED, THR_D117_PRED, @@ -192,7 +195,9 @@ typedef enum { typedef enum { DIAMOND = 0, NSTEP = 1, - HEX = 2 + HEX = 2, + BIGDIA = 3, + SQUARE = 4 } SEARCH_METHODS; typedef enum { @@ -230,20 +235,29 @@ typedef enum { FLAG_SKIP_INTRA_LOWVAR = 32, } MODE_SEARCH_SKIP_LOGIC; +typedef enum { + SUBPEL_ITERATIVE = 0, + SUBPEL_TREE = 1, + // Other methods to come +} SUBPEL_SEARCH_METHODS; + +#define ALL_INTRA_MODES 0x3FF +#define INTRA_DC_ONLY 0x01 +#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED)) +#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)) + typedef struct { int RD; SEARCH_METHODS search_method; int auto_filter; int recode_loop; - int iterative_sub_pixel; - int half_pixel_search; - int quarter_pixel_search; + SUBPEL_SEARCH_METHODS subpel_search_method; + int subpel_iters_per_step; int thresh_mult[MAX_MODES]; int max_step_search_steps; int reduce_first_step_size; int auto_mv_step_size; int optimize_coefficients; - int search_best_filter; int static_segmentation; int comp_inter_joint_search_thresh; int adaptive_rd_thresh; @@ -251,36 +265,43 @@ typedef struct { int skip_encode_frame; int use_lastframe_partitioning; TX_SIZE_SEARCH_METHOD tx_size_search_method; - int use_8tap_always; + int use_lp32x32fdct; int use_avoid_tested_higherror; int skip_lots_of_modes; - int adjust_thresholds_by_speed; int partition_by_variance; int use_one_partition_size_always; int less_rectangular_check; int use_square_partition_only; - int unused_mode_skip_lvl; + int mode_skip_start; int reference_masking; - BLOCK_SIZE_TYPE always_this_block_size; + BLOCK_SIZE always_this_block_size; int auto_min_max_partition_size; int auto_min_max_partition_interval; int auto_min_max_partition_count; - BLOCK_SIZE_TYPE min_partition_size; - BLOCK_SIZE_TYPE max_partition_size; - // int use_min_partition_size; // not used in code - // int use_max_partition_size; + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; int adjust_partitioning_from_last_frame; int last_partitioning_redo_frequency; int disable_splitmv; int using_small_partition_info; + // TODO(jingning): combine the related motion search speed 
features + int adaptive_motion_search; // Implements various heuristics to skip searching modes // The heuristics selected are based on flags // defined in the MODE_SEARCH_SKIP_HEURISTICS enum unsigned int mode_search_skip_flags; - MB_PREDICTION_MODE last_chroma_intra_mode; + // A source variance threshold below which the split mode is disabled + unsigned int disable_split_var_thresh; + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + int intra_y_mode_mask; + int intra_uv_mode_mask; int use_rd_breakout; int use_uv_intra_rd_estimate; + int use_fast_lpf_pick; + int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced } SPEED_FEATURES; typedef struct VP9_COMP { @@ -331,6 +352,10 @@ typedef struct VP9_COMP { int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; + + int current_layer; + int use_svc; + #if CONFIG_MULTIPLE_ARF int alt_ref_fb_idx[NUM_REF_FRAMES - 3]; #endif @@ -360,14 +385,12 @@ typedef struct VP9_COMP { unsigned int mode_check_freq[MAX_MODES]; unsigned int mode_test_hit_counts[MAX_MODES]; unsigned int mode_chosen_counts[MAX_MODES]; - int64_t unused_mode_skip_mask; + int64_t mode_skip_mask; int ref_frame_mask; int set_ref_frame_mask; - int rd_thresh_mult[MAX_MODES]; - int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES]; - int rd_threshes[BLOCK_SIZE_TYPES][MAX_MODES]; - int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES]; + int rd_threshes[BLOCK_SIZES][MAX_MODES]; + int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; // FIXME(rbultje) int64_t? @@ -381,9 +404,9 @@ typedef struct VP9_COMP { // FIXME(rbultje) can this overflow? int rd_tx_select_threshes[4][TX_MODES]; - int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1]; int RDMULT; int RDDIV; @@ -458,8 +481,8 @@ typedef struct VP9_COMP { int cq_target_quality; - int y_mode_count[4][VP9_INTRA_MODES]; - int y_uv_mode_count[VP9_INTRA_MODES][VP9_INTRA_MODES]; + int y_mode_count[4][INTRA_MODES]; + int y_uv_mode_count[INTRA_MODES][INTRA_MODES]; unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; nmv_context_counts NMVcount; @@ -472,6 +495,7 @@ typedef struct VP9_COMP { int last_boost; int kf_boost; int kf_zeromotion_pct; + int gf_zeromotion_pct; int64_t target_bandwidth; struct vpx_codec_pkt_list *output_pkt_list; @@ -527,10 +551,11 @@ typedef struct VP9_COMP { unsigned int active_map_enabled; fractional_mv_step_fp *find_fractional_mv_step; + fractional_mv_step_comp_fp *find_fractional_mv_step_comp; vp9_full_search_fn_t full_search_sad; vp9_refining_search_fn_t refining_search_sad; vp9_diamond_search_fn_t diamond_search_sad; - vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZE_TYPES]; + vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; @@ -623,14 +648,18 @@ typedef struct VP9_COMP { int dummy_packing; /* flag to indicate if packing is dummy */ - unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; + unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS]; unsigned int txfm_stepdown_count[TX_SIZES]; int initial_width; int initial_height; + int 
number_spatial_layers; + int enable_encode_breakout; // Default value is 1. From first pass stats, + // encode_breakout may be disabled. + #if CONFIG_MULTIPLE_ARF // ARF tracking variables. int multi_arf_enabled; @@ -645,7 +674,13 @@ typedef struct VP9_COMP { #endif #ifdef ENTROPY_STATS - int64_t mv_ref_stats[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; +#endif + + +#ifdef MODE_TEST_HIT_STATS + // Debug / test stats + int64_t mode_test_hits[BLOCK_SIZES]; #endif } VP9_COMP; @@ -659,6 +694,17 @@ static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { } } +static int get_scale_ref_frame_idx(VP9_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { + if (ref_frame == LAST_FRAME) { + return 0; + } else if (ref_frame == GOLDEN_FRAME) { + return 1; + } else { + return 2; + } +} + void vp9_encode_frame(VP9_COMP *cpi); void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c index 2b8f2cd..239fd6b 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -21,29 +21,15 @@ #include "./vpx_scale_rtcd.h" void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { - uint8_t *src_y, *dst_y; - int yheight; - int ystride; - int yoffset; - int linestocopy; + YV12_BUFFER_CONFIG *dst_ybc, int fraction) { + const int height = src_ybc->y_height; + const int stride = src_ybc->y_stride; + const int offset = stride * ((height >> 5) * 16 - 8); + const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4; assert(src_ybc->y_stride == dst_ybc->y_stride); - yheight = src_ybc->y_height; - ystride = src_ybc->y_stride; - - linestocopy = (yheight >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - yoffset = ystride * ((yheight >> 5) * 16 - 8); - src_y = src_ybc->y_buffer + yoffset; - dst_y = dst_ybc->y_buffer + yoffset; - - vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16)); + vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset, + stride * (lines_to_copy + 16)); } static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, @@ -125,14 +111,14 @@ static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) { void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) { } -void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct loopfilter *lf = &cpi->mb.e_mbd.lf; +void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { + VP9_COMMON *const cm = &cpi->common; + struct loopfilter *const lf = &cm->lf; int best_err = 0; int filt_err = 0; - int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); - int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); + const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); int filter_step; int filt_high = 0; @@ -145,33 +131,26 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int Bias = 0; // Bias against raising loop filter and in favour of lowering it // Make a copy of the unfiltered / processed recon buffer - vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); + vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); - if (cm->frame_type == KEY_FRAME) - lf->sharpness_level = 0; - else - lf->sharpness_level = cpi->oxcf.Sharpness; + lf->sharpness_level = cm->frame_type == KEY_FRAME 
? 0 + : cpi->oxcf.Sharpness; // Start the search at the previous frame filter level unless it is now out of range. - filt_mid = lf->filter_level; - - if (filt_mid < min_filter_level) - filt_mid = min_filter_level; - else if (filt_mid > max_filter_level) - filt_mid = max_filter_level; + filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); // Define the initial step size - filter_step = (filt_mid < 16) ? 4 : filt_mid / 4; + filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Get baseline error score vp9_set_alt_lf_level(cpi, filt_mid); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial); best_err = vp9_calc_ss_err(sd, cm->frame_to_show); filt_best = filt_mid; // Re-instate the unfiltered frame - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); while (filter_step > 0) { Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images @@ -190,12 +169,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { if ((filt_direction <= 0) && (filt_low != filt_mid)) { // Get Low filter error score vp9_set_alt_lf_level(cpi, filt_low); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // If value is close to the best so far then bias towards a lower loop filter value. if ((filt_err - Bias) < best_err) { @@ -210,12 +189,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { // Now look at filt_high if ((filt_direction >= 0) && (filt_high != filt_mid)) { vp9_set_alt_lf_level(cpi, filt_high); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // Was it better than the previous best? 
if (filt_err < (best_err - Bias)) { @@ -236,3 +215,4 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { lf->filter_level = filt_best; } + diff --git a/libvpx/vp9/encoder/vp9_picklpf.h b/libvpx/vp9/encoder/vp9_picklpf.h index 698cb8d..9de4cf8 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.h +++ b/libvpx/vp9/encoder/vp9_picklpf.h @@ -18,6 +18,5 @@ struct VP9_COMP; void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val); void vp9_pick_filter_level(struct yv12_buffer_config *sd, - struct VP9_COMP *cpi); - + struct VP9_COMP *cpi, int partial); #endif // VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 525f4da..6c8b2a0 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -69,6 +69,7 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, if (x >= zbin) { x += (round_ptr[rc != 0]); + x = clamp(x, INT16_MIN, INT16_MAX); y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * quant_shift_ptr[rc != 0]) >> 16; // quantize (x) x = (y ^ sz) - sz; // get the sign back @@ -84,7 +85,6 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, *eob_ptr = eob + 1; } -// This function works well for large transform size. void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, @@ -94,7 +94,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, rc, eob; - int zbins[2], nzbins[2], zbin; + int zbins[2], nzbins[2]; int x, y, z, sz; int idx = 0; int idx_arr[1024]; @@ -105,8 +105,8 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, eob = -1; // Base ZBIN - zbins[0] = zbin_ptr[0] + zbin_oq_value; - zbins[1] = zbin_ptr[1] + zbin_oq_value; + zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); + zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); nzbins[0] = zbins[0] * -1; nzbins[1] = zbins[1] * -1; @@ -114,7 +114,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, // Pre-scan pass for (i = 0; i < n_coeffs; i++) { rc = scan[i]; - z = coeff_ptr[rc] * 2; + z = coeff_ptr[rc]; // If the coefficient is out of the base ZBIN range, keep it for // quantization. 
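The vp9_quantize_b_32x32_c pre-scan above swaps "double the coefficient" for "halve the zero bin": it now compares coeff_ptr[rc] directly against ROUND_POWER_OF_TWO(zbin, 1) instead of 2 * coeff_ptr[rc] against zbin, and the hunk just below compensates in the quantize step by shifting right by 15 instead of 16. For integer inputs the two zero-bin decisions agree exactly, as this sketch checks (it assumes the stock libvpx macro ROUND_POWER_OF_TWO(v, n) = ((v) + (1 << (n - 1))) >> (n)):

#include <assert.h>
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  int zbin, coeff;
  /* Old pre-scan keeps a coefficient when 2*coeff >= zbin; the new one
   * keeps it when coeff >= ROUND_POWER_OF_TWO(zbin, 1). Since
   * ROUND_POWER_OF_TWO(z, 1) is ceil(z/2), both keep the same set. */
  for (zbin = 0; zbin < 1024; ++zbin)
    for (coeff = 0; coeff < 1024; ++coeff)
      assert((2 * coeff >= zbin) == (coeff >= ROUND_POWER_OF_TWO(zbin, 1)));
  printf("zero-bin decisions agree\n");
  return 0;
}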
@@ -127,31 +127,52 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, for (i = 0; i < idx; i++) { rc = scan[idx_arr[i]]; - // Calculate ZBIN - zbin = (zbins[rc != 0]); - - z = coeff_ptr[rc] * 2; + z = coeff_ptr[rc]; sz = (z >> 31); // sign of z x = (z ^ sz) - sz; // x = abs(z) - if (x >= zbin) { - x += (round_ptr[rc != 0]); - y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * - quant_shift_ptr[rc != 0]) >> 16; // quantize (x) + x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + x = clamp(x, INT16_MIN, INT16_MAX); + y = ((((x * quant_ptr[rc != 0]) >> 16) + x) * + quant_shift_ptr[rc != 0]) >> 15; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value - if (y) { - eob = idx_arr[i]; // last nonzero coeffs - } - } + if (y) + eob = idx_arr[i]; // last nonzero coeffs } } *eob_ptr = eob + 1; } +struct plane_block_idx { + int plane; + int block; +}; + +// TODO(jkoleszar): returning a struct so it can be used in a const context, +// expect to refactor this further later. +static INLINE struct plane_block_idx plane_block_idx(int y_blocks, + int b_idx) { + const int v_offset = y_blocks * 5 / 4; + struct plane_block_idx res; + + if (b_idx < y_blocks) { + res.plane = 0; + res.block = b_idx; + } else if (b_idx < v_offset) { + res.plane = 1; + res.block = b_idx - y_blocks; + } else { + assert(b_idx < y_blocks * 3 / 2); + res.plane = 2; + res.block = b_idx - v_offset; + } + return res; +} + void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks) { MACROBLOCKD *const xd = &mb->e_mbd; @@ -159,14 +180,14 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, const int16_t *scan = get_scan_4x4(tx_type); const int16_t *iscan = get_iscan_4x4(tx_type); - vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16), + vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block), 16, mb->skip_block, mb->plane[pb_idx.plane].zbin, mb->plane[pb_idx.plane].round, mb->plane[pb_idx.plane].quant, mb->plane[pb_idx.plane].quant_shift, - BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16), - BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16), + BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block), + BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block), xd->plane[pb_idx.plane].dequant, mb->plane[pb_idx.plane].zbin_extra, &xd->plane[pb_idx.plane].eobs[pb_idx.block], @@ -185,63 +206,43 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { } void vp9_init_quantizer(VP9_COMP *cpi) { - int i; - int quant_val; - int quant_uv_val; -#if CONFIG_ALPHA - int quant_alpha_val; -#endif - int q; + int i, q; + VP9_COMMON *const cm = &cpi->common; for (q = 0; q < QINDEX_RANGE; q++) { - int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80; - int qrounding_factor = 48; - if (q == 0) { - qzbin_factor = 64; - qrounding_factor = 64; + const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80); + const int qrounding_factor = q == 0 ? 64 : 48; + + // y + for (i = 0; i < 2; ++i) { + const int quant = i == 0 ? 
vp9_dc_quant(q, cm->y_dc_delta_q) + : vp9_ac_quant(q, 0); + invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant); + cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + cpi->y_round[q][i] = (qrounding_factor * quant) >> 7; + cm->y_dequant[q][i] = quant; } - // dc values - quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q); - invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val); - cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.y_dequant[q][0] = quant_val; - - quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q); - invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val); - cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.uv_dequant[q][0] = quant_val; - -#if CONFIG_ALPHA - quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q); - invert_quant(cpi->a_quant[q] + 0, cpi->a_quant_shift[q] + 0, quant_val); - cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.a_dequant[q][0] = quant_val; -#endif - - quant_val = vp9_ac_quant(q, 0); - invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val); - cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7; - cpi->common.y_dequant[q][1] = quant_val; - - quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q); - invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1, - quant_uv_val); - cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7); - cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7; - cpi->common.uv_dequant[q][1] = quant_uv_val; + // uv + for (i = 0; i < 2; ++i) { + const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q) + : vp9_ac_quant(q, cm->uv_ac_delta_q); + invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant); + cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7; + cm->uv_dequant[q][i] = quant; + } #if CONFIG_ALPHA - quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q); - invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1, - quant_alpha_val); - cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); - cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7; - cpi->common.a_dequant[q][1] = quant_alpha_val; + // alpha + for (i = 0; i < 2; ++i) { + const int quant = i == 0 ? 
vp9_dc_quant(q, cm->a_dc_delta_q) + : vp9_ac_quant(q, cm->a_ac_delta_q); + invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant); + cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + cpi->a_round[q][i] = (qrounding_factor * quant) >> 7; + cm->a_dequant[q][i] = quant; + } #endif for (i = 2; i < 8; i++) { @@ -249,20 +250,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1]; cpi->y_zbin[q][i] = cpi->y_zbin[q][1]; cpi->y_round[q][i] = cpi->y_round[q][1]; - cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1]; + cm->y_dequant[q][i] = cm->y_dequant[q][1]; cpi->uv_quant[q][i] = cpi->uv_quant[q][1]; cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1]; cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1]; cpi->uv_round[q][i] = cpi->uv_round[q][1]; - cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1]; + cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; #if CONFIG_ALPHA cpi->a_quant[q][i] = cpi->a_quant[q][1]; cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1]; cpi->a_zbin[q][i] = cpi->a_zbin[q][1]; cpi->a_round[q][i] = cpi->a_round[q][1]; - cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1]; + cm->a_dequant[q][i] = cm->a_dequant[q][1]; #endif } } @@ -272,8 +273,9 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { int i; MACROBLOCKD *xd = &x->e_mbd; int zbin_extra; - int segment_id = xd->mode_info_context->mbmi.segment_id; - const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex); + int segment_id = xd->this_mi->mbmi.segment_id; + const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id, + cpi->common.base_qindex); // Y zbin_extra = (cpi->common.y_dequant[qindex][1] * @@ -308,7 +310,8 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; #endif - x->skip_block = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); + x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id, + SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ x->e_mbd.q_index = qindex; diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index d3a9529..2d12ba9 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -71,7 +71,6 @@ int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, void vp9_save_coding_context(VP9_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; // Stores a snapshot of key state variables which can subsequently be // restored with a call to vp9_restore_coding_context. 
These functions are @@ -89,7 +88,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, xd->seg.pred_probs); + vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); @@ -99,8 +98,8 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); - vp9_copy(cc->last_ref_lf_deltas, xd->lf.last_ref_deltas); - vp9_copy(cc->last_mode_lf_deltas, xd->lf.last_mode_deltas); + vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); + vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); vp9_copy(cc->coef_probs, cm->fc.coef_probs); vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); @@ -111,7 +110,6 @@ void vp9_save_coding_context(VP9_COMP *cpi) { void vp9_restore_coding_context(VP9_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. @@ -127,7 +125,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(xd->seg.pred_probs, cc->segment_pred_probs); + vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); @@ -138,8 +136,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); - vp9_copy(xd->lf.last_ref_deltas, cc->last_ref_lf_deltas); - vp9_copy(xd->lf.last_mode_deltas, cc->last_mode_lf_deltas); + vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); + vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); vp9_copy(cm->fc.coef_probs, cc->coef_probs); vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); @@ -149,9 +147,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { void vp9_setup_key_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; - vp9_setup_past_independence(cm, xd); + vp9_setup_past_independence(cm); // interval before next GF cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; @@ -162,9 +159,8 @@ void vp9_setup_key_frame(VP9_COMP *cpi) { void vp9_setup_inter_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; if (cm->error_resilient_mode || cm->intra_only) - vp9_setup_past_independence(cm, xd); + vp9_setup_past_independence(cm); assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS); cm->fc = cm->frame_contexts[cm->frame_context_idx]; diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 2d93250..df00334 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include <stdio.h> #include <math.h> #include <limits.h> @@ -49,65 +48,66 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -#define I4X4_PRED 0x8000 -#define SPLITMV 0x10000 +#define LAST_FRAME_MODE_MASK 0xFFDADCD60 +#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0 +#define ALT_REF_MODE_MASK 0xFF8C648D0 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - {NEARESTMV, LAST_FRAME, NONE}, - {DC_PRED, INTRA_FRAME, NONE}, - - {NEARESTMV, ALTREF_FRAME, NONE}, - {NEARESTMV, GOLDEN_FRAME, NONE}, - {NEWMV, LAST_FRAME, NONE}, - {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, - {NEARMV, LAST_FRAME, NONE}, - {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {NEWMV, GOLDEN_FRAME, NONE}, - {NEWMV, ALTREF_FRAME, NONE}, - {NEARMV, ALTREF_FRAME, NONE}, - - {TM_PRED, INTRA_FRAME, NONE}, - - {NEARMV, LAST_FRAME, ALTREF_FRAME}, - {NEWMV, LAST_FRAME, ALTREF_FRAME}, - {NEARMV, GOLDEN_FRAME, NONE}, - {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, - {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {SPLITMV, LAST_FRAME, NONE}, - {SPLITMV, GOLDEN_FRAME, NONE}, - {SPLITMV, ALTREF_FRAME, NONE}, - {SPLITMV, LAST_FRAME, ALTREF_FRAME}, - {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {ZEROMV, LAST_FRAME, NONE}, - {ZEROMV, GOLDEN_FRAME, NONE}, - {ZEROMV, ALTREF_FRAME, NONE}, - {ZEROMV, LAST_FRAME, ALTREF_FRAME}, - {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {I4X4_PRED, INTRA_FRAME, NONE}, - {H_PRED, INTRA_FRAME, NONE}, - {V_PRED, INTRA_FRAME, NONE}, - {D135_PRED, INTRA_FRAME, NONE}, - {D27_PRED, INTRA_FRAME, NONE}, - {D153_PRED, INTRA_FRAME, NONE}, - {D63_PRED, INTRA_FRAME, NONE}, - {D117_PRED, INTRA_FRAME, NONE}, - {D45_PRED, INTRA_FRAME, NONE}, + {RD_NEARESTMV, LAST_FRAME, NONE}, + {RD_NEARESTMV, ALTREF_FRAME, NONE}, + {RD_NEARESTMV, GOLDEN_FRAME, NONE}, + + {RD_DC_PRED, INTRA_FRAME, NONE}, + + {RD_NEWMV, LAST_FRAME, NONE}, + {RD_NEWMV, ALTREF_FRAME, NONE}, + {RD_NEWMV, GOLDEN_FRAME, NONE}, + + {RD_NEARMV, LAST_FRAME, NONE}, + {RD_NEARMV, ALTREF_FRAME, NONE}, + {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME}, + {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_TM_PRED, INTRA_FRAME, NONE}, + + {RD_NEARMV, LAST_FRAME, ALTREF_FRAME}, + {RD_NEWMV, LAST_FRAME, ALTREF_FRAME}, + {RD_NEARMV, GOLDEN_FRAME, NONE}, + {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, + {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_SPLITMV, LAST_FRAME, NONE}, + {RD_SPLITMV, GOLDEN_FRAME, NONE}, + {RD_SPLITMV, ALTREF_FRAME, NONE}, + {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME}, + {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_ZEROMV, LAST_FRAME, NONE}, + {RD_ZEROMV, GOLDEN_FRAME, NONE}, + {RD_ZEROMV, ALTREF_FRAME, NONE}, + {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME}, + {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_I4X4_PRED, INTRA_FRAME, NONE}, + {RD_H_PRED, INTRA_FRAME, NONE}, + {RD_V_PRED, INTRA_FRAME, NONE}, + {RD_D135_PRED, INTRA_FRAME, NONE}, + {RD_D207_PRED, INTRA_FRAME, NONE}, + {RD_D153_PRED, INTRA_FRAME, NONE}, + {RD_D63_PRED, INTRA_FRAME, NONE}, + {RD_D117_PRED, INTRA_FRAME, NONE}, + {RD_D45_PRED, INTRA_FRAME, NONE}, }; // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for blocks size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc). 
-static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] = +static int rd_thresh_block_size_factor[BLOCK_SIZES] = {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32}; -#define BASE_RD_THRESH_FREQ_FACT 16 -#define MAX_RD_THRESH_FREQ_FACT 32 -#define MAX_RD_THRESH_FREQ_INC 1 +#define MAX_RD_THRESH_FACT 64 +#define RD_THRESH_INC 1 static void fill_token_costs(vp9_coeff_cost *c, vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { @@ -160,6 +160,15 @@ static int compute_rd_mult(int qindex) { return (11 * q * q) >> 2; } +static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) { + if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) { + assert(!"Invalid rd_mode"); + return MB_MODE_COUNT; + } + assert((int)rd_mode < (int)MB_MODE_COUNT); + return (MB_PREDICTION_MODE)rd_mode; +} + void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; @@ -199,7 +208,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->RDDIV = 1; cpi->RDMULT /= 100; - for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) { + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { for (i = 0; i < MAX_MODES; ++i) { // Threshold here seem unecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[] @@ -213,18 +222,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } else { cpi->rd_threshes[bsize][i] = INT_MAX; } - cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - - if (cpi->sf.adaptive_rd_thresh) - cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; - else - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } else { cpi->RDDIV = 100; - for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) { + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { for (i = 0; i < MAX_MODES; i++) { // Threshold here seem unecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[] @@ -237,12 +240,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } else { cpi->rd_threshes[bsize][i] = INT_MAX; } - cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - - if (cpi->sf.adaptive_rd_thresh) - cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; - else - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } @@ -277,16 +274,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } } -static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) { - return bsize_from_dim_lookup[bwl][bhl]; -} - -static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize, - struct macroblockd_plane *pd) { - return get_block_size(plane_block_width_log2by4(bsize, pd), - plane_block_height_log2by4(bsize, pd)); -} - static INLINE void linear_interpolate2(double x, int ntab, int inv_step, const double *tab1, const double *tab2, double *v1, double *v2) { @@ -388,7 +375,7 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep, vp9_clear_system_state(); } -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, +static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum) { // Note our transform coeffs are 8 times an orthogonal transform. 
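The rd_thresh_block_size_factor table above stores its multipliers with two fractional bits (hence the "<< 2" comment: 2 = x0.5, 4 = x1.0 at the 8x8 baseline, 32 = x8), so applying an entry is a multiply followed by a right shift of 2. A small sketch of that scaling, assuming the usual 13-entry BLOCK_SIZES ordering where BLOCK_4X4 is index 0, BLOCK_8X8 index 3, and BLOCK_64X64 index 12 (scale_rd_thresh is an illustrative name):

#include <stdio.h>

/* Table from the hunk above: per-block-size correction for rd
 * thresholds defined at the 8x8 baseline, in Q2 fixed point. */
static const int rd_thresh_block_size_factor[13] =
    {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};

static int scale_rd_thresh(int thresh_8x8, int bsize) {
  /* multiply by the Q2 factor, then drop the two fractional bits */
  return (thresh_8x8 * rd_thresh_block_size_factor[bsize]) >> 2;
}

int main(void) {
  printf("%d\n", scale_rd_thresh(1000, 0));   /* BLOCK_4X4:   x0.5 -> 500  */
  printf("%d\n", scale_rd_thresh(1000, 3));   /* BLOCK_8X8:   x1.0 -> 1000 */
  printf("%d\n", scale_rd_thresh(1000, 12));  /* BLOCK_64X64: x8.0 -> 8000 */
  return 0;
}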
@@ -399,18 +386,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); unsigned int sse; int rate; int64_t dist; (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), + model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; @@ -421,81 +404,52 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, *out_dist_sum = dist_sum << 4; } -static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int64_t *out_dist_sum) { - // Note our transform coeffs are 8 times an orthogonal transform. - // Hence quantizer step is also 8 times. To get effective quantizer - // we need to divide by 8 before sending to modeling function. - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); - unsigned int sse; - int rate; - int64_t dist; - (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), - pd->dequant[1] >> 3, &rate, &dist); - - *out_rate_sum = rate; - *out_dist_sum = dist << 4; -} - -static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, +static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize, TX_SIZE tx_size, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, int *out_skip) { - int t = 4, j, k; - BLOCK_SIZE_TYPE bs = BLOCK_4X4; + int j, k; + BLOCK_SIZE bs; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - const int width = plane_block_width(bsize, pd); - const int height = plane_block_height(bsize, pd); + const int width = 4 << num_4x4_blocks_wide_lookup[bsize]; + const int height = 4 << num_4x4_blocks_high_lookup[bsize]; int rate_sum = 0; int64_t dist_sum = 0; + const int t = 4 << tx_size; if (tx_size == TX_4X4) { bs = BLOCK_4X4; - t = 4; } else if (tx_size == TX_8X8) { bs = BLOCK_8X8; - t = 8; } else if (tx_size == TX_16X16) { bs = BLOCK_16X16; - t = 16; } else if (tx_size == TX_32X32) { bs = BLOCK_32X32; - t = 32; } else { assert(0); } + *out_skip = 1; for (j = 0; j < height; j += t) { for (k = 0; k < width; k += t) { int rate; int64_t dist; unsigned int sse; - (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k, - p->src.stride, - pd->dst.buf + j * pd->dst.stride + k, - pd->dst.stride, &sse); + cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride, + &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride, + &sse); // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, 
- &rate, &dist); + model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; dist_sum += dist; *out_skip &= (rate < 1024); } } + *out_rate_sum = rate_sum; - *out_dist_sum = (dist_sum << 4); + *out_dist_sum = dist_sum << 4; } int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, @@ -526,42 +480,39 @@ static const int16_t band_counts[TX_SIZES][8] = { }; static INLINE int cost_coeffs(MACROBLOCK *mb, - int plane, int block, PLANE_TYPE type, + int plane, int block, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, const int16_t *scan, const int16_t *nb) { MACROBLOCKD *const xd = &mb->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - int pt, c, cost; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + struct macroblockd_plane *pd = &xd->plane[plane]; + const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; - const int eob = xd->plane[plane].eobs[block]; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); + const int eob = pd->eobs[block]; + const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); const int ref = mbmi->ref_frame[0] != INTRA_FRAME; - unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref]; - ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; + unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = + mb->token_costs[tx_size][type][ref]; + const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; uint8_t token_cache[1024]; + int pt = combine_entropy_contexts(above_ec, left_ec); + int c, cost; // Check for consistency of tx_size with mode info - assert((!type && !plane) || (type && plane)); - if (type == PLANE_TYPE_Y_WITH_DC) { - assert(xd->mode_info_context->mbmi.txfm_size == tx_size); - } else { - assert(tx_size == get_uv_tx_size(mbmi)); - } - - pt = combine_entropy_contexts(above_ec, left_ec); + assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size + : get_uv_tx_size(mbmi) == tx_size); if (eob == 0) { // single eob token cost = token_costs[0][0][pt][DCT_EOB_TOKEN]; c = 0; } else { - int v, prev_t, band_left = *band_count++; + int band_left = *band_count++; // dc token - v = qcoeff_ptr[0]; - prev_t = vp9_dct_value_tokens_ptr[v].token; + int v = qcoeff_ptr[0]; + int prev_t = vp9_dct_value_tokens_ptr[v].token; cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; token_cache[0] = vp9_pt_energy_class[prev_t]; ++token_costs; @@ -591,13 +542,12 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, } // is eob first coefficient; - *A = *L = c > 0; + *A = *L = (c > 0); return cost; } struct rdcost_block_args { - VP9_COMMON *cm; MACROBLOCK *x; ENTROPY_CONTEXT t_above[16]; ENTROPY_CONTEXT t_left[16]; @@ -612,23 +562,23 @@ struct rdcost_block_args { const int16_t *scan, *nb; }; -static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { + const int ss_txfrm_size = tx_size << 1; struct rdcost_block_args* args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; int64_t this_sse; int shift = args->tx_size == TX_32X32 ? 
0 : 2; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; args->sse += this_sse >> shift; if (x->skip_encode && - xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { + xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> shift; @@ -637,119 +587,25 @@ static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, } } -static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct rdcost_block_args* args = arg; - int x_idx, y_idx; - MACROBLOCKD * const xd = &args->x->e_mbd; - txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, - &y_idx); + int x_idx, y_idx; + txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); args->rate += cost_coeffs(args->x, plane, block, - xd->plane[plane].plane_type, args->t_above + x_idx, + args->t_above + x_idx, args->t_left + y_idx, args->tx_size, args->scan, args->nb); } -// FIXME(jingning): need to make the rd test of chroma components consistent -// with that of luma component. this function should be deprecated afterwards. -static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD * const xd = &x->e_mbd; - const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]); - const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]); - const int bw = 1 << bwl, bh = 1 << bhl; - int i; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, - 0, 0, 0, INT64_MAX, 0 }; - - switch (tx_size) { - case TX_4X4: - vpx_memcpy(&args.t_above, xd->plane[plane].above_context, - sizeof(ENTROPY_CONTEXT) * bw); - vpx_memcpy(&args.t_left, xd->plane[plane].left_context, - sizeof(ENTROPY_CONTEXT) * bh); - args.scan = vp9_default_scan_4x4; - args.nb = vp9_default_scan_4x4_neighbors; - break; - case TX_8X8: - for (i = 0; i < bw; i += 2) - args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i]; - for (i = 0; i < bh; i += 2) - args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i]; - args.scan = vp9_default_scan_8x8; - args.nb = vp9_default_scan_8x8_neighbors; - break; - case TX_16X16: - for (i = 0; i < bw; i += 4) - args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i]; - for (i = 0; i < bh; i += 4) - args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i]; - args.scan = vp9_default_scan_16x16; - args.nb = vp9_default_scan_16x16_neighbors; - break; - case TX_32X32: - for (i = 0; i < bw; i += 8) - args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i]; - for (i = 0; i < bh; i += 8) - args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i]; - args.scan = vp9_default_scan_32x32; - args.nb = vp9_default_scan_32x32_neighbors; - break; - default: - assert(0); - } - - foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args); - return args.rate; -} - -static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - int cost = 0, plane; - - for (plane = 1; 
plane < MAX_MB_PLANE; plane++) { - cost += rdcost_plane(cm, x, plane, bsize, tx_size); - } - return cost; -} - -static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int shift, int64_t *sse) { - struct macroblockd_plane *p = &x->e_mbd.plane[0]; - const int bwl = plane_block_width_log2by4(bsize, p); - const int bhl = plane_block_height_log2by4(bsize, p); - int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, - 16 << (bwl + bhl), sse) >> shift; - *sse >>= shift; - return e; -} - -static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int shift, int64_t *sse) { - int64_t sum = 0, this_sse; - int plane; - - *sse = 0; - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - struct macroblockd_plane *p = &x->e_mbd.plane[plane]; - const int bwl = plane_block_width_log2by4(bsize, p); - const int bhl = plane_block_height_log2by4(bsize, p); - sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - 16 << (bwl + bhl), &this_sse); - *sse += this_sse; - } - *sse >>= shift; - return sum >> shift; -} - -static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct rdcost_block_args *args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - struct encode_b_args encode_args = {args->cm, x, NULL}; + struct encode_b_args encode_args = {x, NULL}; int64_t rd1, rd2, rd; if (args->skip) @@ -765,58 +621,61 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, return; } - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args); + if (!is_inter_block(&xd->this_mi->mbmi)) + vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); else - xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args); + vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args); - dist_block(plane, block, bsize, ss_txfrm_size, args); - rate_block(plane, block, bsize, ss_txfrm_size, args); + dist_block(plane, block, tx_size, args); + rate_block(plane, block, plane_bsize, tx_size, args); } -static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { +static void txfm_rd_in_plane(MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const int bw = 1 << bwl, bh = 1 << bhl; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs]; int i; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, + num_4x4_blocks_wide, num_4x4_blocks_high, 0, 0, 0, ref_best_rd, 0 }; - xd->mode_info_context->mbmi.txfm_size = tx_size; + if (plane == 0) + xd->this_mi->mbmi.tx_size = tx_size; + switch (tx_size) { case TX_4X4: vpx_memcpy(&args.t_above, pd->above_context, - sizeof(ENTROPY_CONTEXT) * 
bw); + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide); vpx_memcpy(&args.t_left, pd->left_context, - sizeof(ENTROPY_CONTEXT) * bh); - get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0), + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high); + get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0), &args.scan, &args.nb); break; case TX_8X8: - for (i = 0; i < bw; i += 2) + for (i = 0; i < num_4x4_blocks_wide; i += 2) args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 2) + for (i = 0; i < num_4x4_blocks_high; i += 2) args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; - get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd), + get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_16X16: - for (i = 0; i < bw; i += 4) + for (i = 0; i < num_4x4_blocks_wide; i += 4) args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 4) + for (i = 0; i < num_4x4_blocks_high; i += 4) args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; - get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd), + get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_32X32: - for (i = 0; i < bw; i += 8) + for (i = 0; i < num_4x4_blocks_wide; i += 8) args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 8) + for (i = 0; i < num_4x4_blocks_high; i += 8) args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; args.scan = vp9_default_scan_32x32; args.nb = vp9_default_scan_32x32_neighbors; @@ -825,40 +684,39 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, assert(0); } - foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args); + foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); *distortion = args.dist; *rate = args.rate; *sse = args.sse; - *skippable = vp9_sby_is_skippable(xd, bsize) && (!args.skip); + *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip); } static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *sse, int64_t ref_best_rd, - BLOCK_SIZE_TYPE bs) { - const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); + BLOCK_SIZE bs) { + const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; if (max_txfm_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { - mbmi->txfm_size = TX_32X32; + mbmi->tx_size = TX_32X32; } else if (max_txfm_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { - mbmi->txfm_size = TX_16X16; + mbmi->tx_size = TX_16X16; } else if (cm->tx_mode != ONLY_4X4) { - mbmi->txfm_size = TX_8X8; + mbmi->tx_size = TX_8X8; } else { - mbmi->txfm_size = TX_4X4; + mbmi->tx_size = TX_4X4; } - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, - &sse[mbmi->txfm_size], ref_best_rd, bs, - mbmi->txfm_size); + txfm_rd_in_plane(x, rate, distortion, skip, + &sse[mbmi->tx_size], ref_best_rd, 0, bs, + mbmi->tx_size); cpi->txfm_stepdown_count[0]++; } @@ -867,18 +725,17 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int64_t *d, int64_t *distortion, int *s, int *skip, int64_t tx_cache[TX_MODES], - BLOCK_SIZE_TYPE bs) { - const TX_SIZE max_tx_size = TX_32X32 - - (bs < BLOCK_32X32) - (bs < 
BLOCK_16X16); + BLOCK_SIZE bs) { + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZES][2]; int n, m; int s0, s1; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi); for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; @@ -914,26 +771,26 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, (cm->tx_mode == TX_MODE_SELECT && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_32X32; + mbmi->tx_size = TX_32X32; } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_16X16; + mbmi->tx_size = TX_16X16; } else if (cm->tx_mode == ALLOW_8X8 || cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { - mbmi->txfm_size = TX_8X8; + mbmi->tx_size = TX_8X8; } else { - mbmi->txfm_size = TX_4X4; + mbmi->tx_size = TX_4X4; } - *distortion = d[mbmi->txfm_size]; - *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; - *skip = s[mbmi->txfm_size]; + *distortion = d[mbmi->tx_size]; + *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; + *skip = s[mbmi->tx_size]; tx_cache[ONLY_4X4] = rd[TX_4X4][0]; tx_cache[ALLOW_8X8] = rd[TX_8X8][0]; @@ -971,13 +828,11 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int64_t *d, int64_t *distortion, int *s, int *skip, int64_t *sse, int64_t ref_best_rd, - BLOCK_SIZE_TYPE bs, - int *model_used) { - const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); + BLOCK_SIZE bs) { + const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZES][2]; int n, m; @@ -985,7 +840,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi); // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); @@ -1023,35 +878,28 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_32X32; + mbmi->tx_size = TX_32X32; } else if (max_txfm_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_16X16][1] <= rd[TX_8X8][1] && rd[TX_16X16][1] <= rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_16X16; + mbmi->tx_size = TX_16X16; } else if (cm->tx_mode == ALLOW_8X8 || cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] <= rd[TX_4X4][1])) { - mbmi->txfm_size = TX_8X8; + 
mbmi->tx_size = TX_8X8; } else { - mbmi->txfm_size = TX_4X4; + mbmi->tx_size = TX_4X4; } - if (model_used[mbmi->txfm_size]) { - // Actually encode using the chosen mode if a model was used, but do not - // update the r, d costs - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, - &sse[mbmi->txfm_size], ref_best_rd, - bs, mbmi->txfm_size); - } else { - *distortion = d[mbmi->txfm_size]; - *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; - *skip = s[mbmi->txfm_size]; - } + // Actually encode using the chosen mode if a model was used, but do not + // update the r, d costs + txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], + ref_best_rd, 0, bs, mbmi->tx_size); if (max_txfm_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && @@ -1071,14 +919,13 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, - int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs, + int *skip, int64_t *psse, BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - VP9_COMMON *const cm = &cpi->common; int r[TX_SIZES][2], s[TX_SIZES]; int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; assert(bs == mbmi->sb_type); if (mbmi->ref_frame[0] > INTRA_FRAME) @@ -1091,65 +938,43 @@ static void super_block_yrd(VP9_COMP *cpi, choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); if (psse) - *psse = sse[mbmi->txfm_size]; + *psse = sse[mbmi->tx_size]; return; } if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && mbmi->ref_frame[0] > INTRA_FRAME) { - int model_used[TX_SIZES] = {1, 1, 1, 1}; - if (bs >= BLOCK_32X32) { - if (model_used[TX_32X32]) - model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, - &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], - &s[TX_32X32], &sse[TX_32X32], INT64_MAX, - bs, TX_32X32); - } - if (bs >= BLOCK_16X16) { - if (model_used[TX_16X16]) - model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, - &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], - &s[TX_16X16], &sse[TX_16X16], INT64_MAX, - bs, TX_16X16); - } - if (model_used[TX_8X8]) - model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, - &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], - &sse[TX_8X8], INT64_MAX, bs, TX_8X8); + if (bs >= BLOCK_32X32) + model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, + &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + if (bs >= BLOCK_16X16) + model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, + &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); - if (model_used[TX_4X4]) - model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, - &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], - &sse[TX_4X4], INT64_MAX, bs, TX_4X4); + model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, + &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); + + model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, + &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, - skip, sse, ref_best_rd, bs, model_used); + skip, sse, ref_best_rd, bs); } else { if (bs >= BLOCK_32X32) - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], - &s[TX_32X32], &sse[TX_32X32], ref_best_rd, - bs, TX_32X32); + 
txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], + &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32); if (bs >= BLOCK_16X16) - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], - &s[TX_16X16], &sse[TX_16X16], ref_best_rd, - bs, TX_16X16); - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], - &sse[TX_8X8], ref_best_rd, bs, TX_8X8); - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], - &sse[TX_4X4], ref_best_rd, bs, TX_4X4); + txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], + &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16); + txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8); + txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); } if (psse) - *psse = sse[mbmi->txfm_size]; + *psse = sse[mbmi->tx_size]; } static int conditional_skipintra(MB_PREDICTION_MODE mode, @@ -1162,7 +987,7 @@ static int conditional_skipintra(MB_PREDICTION_MODE mode, best_intra_mode != V_PRED && best_intra_mode != D45_PRED) return 1; - if (mode == D27_PRED && + if (mode == D207_PRED && best_intra_mode != H_PRED && best_intra_mode != D45_PRED) return 1; @@ -1179,8 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, - BLOCK_SIZE_TYPE bsize, - int64_t rd_thresh) { + BLOCK_SIZE bsize, int64_t rd_thresh) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = rd_thresh; @@ -1190,9 +1014,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; - uint8_t *src_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib, + uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib, p->src.buf, src_stride); - uint8_t *dst_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib, + uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib, pd->dst.buf, dst_stride); int16_t *src_diff, *coeff; @@ -1208,11 +1032,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(ta, a, sizeof(ta)); vpx_memcpy(tl, l, sizeof(tl)); - xd->mode_info_context->mbmi.txfm_size = TX_4X4; + xd->this_mi->mbmi.tx_size = TX_4X4; for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; + + if (!(cpi->sf.intra_y_mode_mask & (1 << mode))) + continue; + // Only do the oblique modes if the best so far is // one of the neighboring directional modes if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { @@ -1234,10 +1062,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode = mode; - src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block, - p->src_diff); - coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); + xd->this_mi->bmi[block].as_mode = mode; + src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); + coeff = BLOCK_OFFSET(x->plane[0].coeff, block); vp9_predict_intra_block(xd, block, 1, TX_4X4, mode, x->skip_encode ? 
src : dst, @@ -1257,20 +1084,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, } scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); - ratey += cost_coeffs(x, 0, block, PLANE_TYPE_Y_WITH_DC, + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, scan, vp9_get_coef_neighbors_handle(scan)); - distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, - block, 16), + distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &ssz) >> 2; if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; if (tx_type != DCT_DCT) - vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride, tx_type); else - xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride); } } @@ -1312,7 +1138,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + MODE_INFO *const mic = xd->this_mi; + const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; + const MODE_INFO *left_mi = xd->mi_8x8[-1]; + const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; @@ -1322,7 +1151,6 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; int *bmode_costs; - MODE_INFO *const mic = xd->mode_info_context; vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left)); @@ -1332,24 +1160,22 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block. for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - const int mis = xd->mode_info_stride; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int64_t UNINITIALIZED_IS_SAFE(d), this_rd; + MB_PREDICTION_MODE best_mode = DC_PRED; + int r = INT_MAX, ry = INT_MAX; + int64_t d = INT64_MAX, this_rd = INT64_MAX; i = idy * 2 + idx; - if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis); + const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(mic, i) : DC_PRED; + left_block_mode(mic, left_mi, i) : + DC_PRED; bmode_costs = mb->y_mode_costs[A][L]; } this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, - t_above + idx, t_left + idy, - &r, &ry, &d, bsize, - best_rd - total_rd); + t_above + idx, t_left + idy, &r, &ry, &d, + bsize, best_rd - total_rd); if (this_rd >= best_rd - total_rd) return INT64_MAX; @@ -1372,7 +1198,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, *rate = cost; *rate_y = tot_rate_y; *distortion = total_distortion; - xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode; + mic->mbmi.mode = mic->bmi[3].as_mode; return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); } @@ -1380,15 +1206,16 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int64_t tx_cache[TX_MODES], int64_t best_rd) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = DC_PRED; MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->this_mi; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd; - TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); + TX_SIZE best_tx = TX_4X4; int i; int *bmode_costs = x->mbmode_cost; @@ -1399,17 +1226,20 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, /* Y Search for intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_tx_cache[TX_MODES]; - MODE_INFO *const mic = xd->mode_info_context; - const int mis = xd->mode_info_stride; + MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; + MODE_INFO *left_mi = xd->mi_8x8[-1]; + + if (!(cpi->sf.intra_y_mode_mask & (1 << mode))) + continue; if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis); + const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0); const MB_PREDICTION_MODE L = xd->left_available ? 
- left_block_mode(mic, 0) : DC_PRED; + left_block_mode(mic, left_mi, 0) : DC_PRED; bmode_costs = x->y_mode_costs[A][L]; } - x->e_mbd.mode_info_context->mbmi.mode = mode; + mic->mbmi.mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, bsize, local_tx_cache, best_rd); @@ -1423,7 +1253,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_rd) { mode_selected = mode; best_rd = this_rd; - best_tx = x->e_mbd.mode_info_context->mbmi.txfm_size; + best_tx = mic->mbmi.tx_size; *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; @@ -1431,7 +1261,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) { - for (i = 0; i < TX_MODES; i++) { + for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) { const int64_t adj_rd = this_rd + local_tx_cache[i] - local_tx_cache[cpi->common.tx_mode]; if (adj_rd < tx_cache[i]) { @@ -1441,61 +1271,78 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - x->e_mbd.mode_info_context->mbmi.mode = mode_selected; - x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx; + mic->mbmi.mode = mode_selected; + mic->mbmi.tx_size = best_tx; return best_rd; } -static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - BLOCK_SIZE_TYPE bsize, - TX_SIZE uv_tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; - int64_t dummy; - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - vp9_encode_intra_block_uv(cm, x, bsize); - else - vp9_xform_quant_sbuv(cm, x, bsize); - - *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2, - sse ? 
sse : &dummy); - *rate = rdcost_uv(cm, x, bsize, uv_tx_size); - *skippable = vp9_sbuv_is_skippable(xd, bsize); -} - static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, - int64_t *sse, BLOCK_SIZE_TYPE bsize) { + int64_t *sse, BLOCK_SIZE bsize, + int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); + int plane; + int pnrate = 0, pnskip = 1; + int64_t pndist = 0, pnsse = 0; - if (mbmi->ref_frame[0] > INTRA_FRAME) + if (ref_best_rd < 0) + goto term; + + if (is_inter_block(mbmi)) vp9_subtract_sbuv(x, bsize); - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize, - uv_txfm_size); + *rate = 0; + *distortion = 0; + *sse = 0; + *skippable = 1; + + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, + ref_best_rd, plane, bsize, uv_txfm_size); + if (pnrate == INT_MAX) + goto term; + *rate += pnrate; + *distortion += pndist; + *sse += pnsse; + *skippable &= pnskip; + } + return; + + term: + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + return; } static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = DC_PRED; int64_t best_rd = INT64_MAX, this_rd; int this_rate_tokenonly, this_rate, s; - int64_t this_distortion; + int64_t this_distortion, this_sse; + + // int mode_mask = (bsize <= BLOCK_8X8) + // ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask; + + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + // if (!(mode_mask & (1 << mode))) + if (!(cpi->sf.intra_uv_mode_mask & (1 << mode))) + continue; - MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ? 
- TM_PRED : cpi->sf.last_chroma_intra_mode; + x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; - for (mode = DC_PRED; mode <= last_mode; mode++) { - x->e_mbd.mode_info_context->mbmi.uv_mode = mode; super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, - &this_distortion, &s, NULL, bsize); + &this_distortion, &s, &this_sse, bsize, best_rd); + if (this_rate_tokenonly == INT_MAX) + continue; this_rate = this_rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); @@ -1510,7 +1357,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected; + x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected; return best_rd; } @@ -1518,12 +1365,13 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { int64_t this_rd; + int64_t this_sse; - x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; super_block_uvrd(&cpi->common, x, rate_tokenonly, - distortion, skippable, NULL, bsize); + distortion, skippable, &this_sse, bsize, INT64_MAX); *rate = *rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -1531,7 +1379,7 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; } -static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, +static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, MB_PREDICTION_MODE *mode_uv) { @@ -1541,27 +1389,25 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, // appropriate speed flag is set. if (cpi->sf.use_uv_intra_rd_estimate) { rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : - bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. } else { rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 - : bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); } - *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode; + *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode; } static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, int mode_context) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = xd->mode_info_context->mbmi.segment_id; + const int segment_id = xd->this_mi->mbmi.segment_id; // Don't account for mode here if segment skip is enabled. 
- if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { assert(is_inter_mode(mode)); return x->inter_mode_cost[mode_context][mode - NEARESTMV]; } else { @@ -1570,18 +1416,18 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, } void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) { - x->e_mbd.mode_info_context->mbmi.mode = mb; - x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int; + x->e_mbd.mi_8x8[0]->mbmi.mode = mb; + x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int; } static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv); @@ -1594,8 +1440,8 @@ static int labels2mode(MACROBLOCK *x, int i, int_mv *second_best_ref_mv, int *mvjcost, int *mvcost[2], VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mode_info_context; - MB_MODE_INFO * mbmi = &mic->mbmi; + MODE_INFO *const mic = xd->this_mi; + MB_MODE_INFO *mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; @@ -1641,7 +1487,7 @@ static int labels2mode(MACROBLOCK *x, int i, } cost = cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + mbmi->mode_context[mbmi->ref_frame[0]]); mic->bmi[i].as_mv[0].as_int = this_mv->as_int; if (mbmi->ref_frame[1] > 0) @@ -1668,42 +1514,32 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, int k; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; - MODE_INFO *const mi = xd->mode_info_context; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + MODE_INFO *const mi = xd->this_mi; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; const int width = plane_block_width(bsize, pd); const int height = plane_block_height(bsize, pd); int idx, idy; const int src_stride = x->plane[0].src.stride; - uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i, x->plane[0].src.buf, src_stride); - int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i, + int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i, x->plane[0].src_diff); - int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i); - uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - pd->pre[0].buf, - pd->pre[0].stride); - uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - pd->dst.buf, - pd->dst.stride); + int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i); + uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i, + pd->dst.buf, pd->dst.stride); int64_t thisdistortion = 0, thissse = 0; int thisrate = 0; + int ref, second_ref = has_second_ref(&mi->mbmi); - vp9_build_inter_predictor(pre, pd->pre[0].stride, - dst, pd->dst.stride, - &mi->bmi[i].as_mv[0].as_mv, - &xd->scale_factor[0], - width, height, 0, &xd->subpix, MV_PRECISION_Q3); - - if (mi->mbmi.ref_frame[1] > 0) { - uint8_t* const second_pre = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - pd->pre[1].buf, pd->pre[1].stride); - vp9_build_inter_predictor(second_pre, pd->pre[1].stride, + for (ref = 0; ref < 1 + second_ref; ++ref) { + const uint8_t *pre = 
raster_block_offset_uint8(BLOCK_8X8, i, + pd->pre[ref].buf, pd->pre[ref].stride); + vp9_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride, - &mi->bmi[i].as_mv[1].as_mv, - &xd->scale_factor[1], - width, height, 1, &xd->subpix, MV_PRECISION_Q3); + &mi->bmi[i].as_mv[ref].as_mv, + &xd->scale_factor[ref], + width, height, ref, &xd->subpix, MV_PRECISION_Q3); } vp9_subtract_block(height, width, src_diff, 8, src, src_stride, @@ -1715,15 +1551,15 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, int64_t ssz, rd, rd1, rd2; k += (idy * 2 + idx); - src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k, + src_diff = raster_block_offset_int16(BLOCK_8X8, k, x->plane[0].src_diff); - coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k); + coeff = BLOCK_OFFSET(x->plane[0].coeff, k); x->fwd_txm4x4(src_diff, coeff, 16); x->quantize_b_4x4(x, k, DCT_DCT, 16); - thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k, 16), + thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); thissse += ssz; - thisrate += cost_coeffs(x, 0, k, PLANE_TYPE_Y_WITH_DC, + thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4, vp9_default_scan_4x4, @@ -1764,7 +1600,7 @@ typedef struct { int64_t sse; int segment_yrate; MB_PREDICTION_MODE modes[4]; - SEG_RDSTAT rdstat[4][VP9_INTER_MODES]; + SEG_RDSTAT rdstat[4][INTER_MODES]; int mvthresh; } BEST_SEG_INFO; @@ -1778,26 +1614,23 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { } static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { - MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; - x->plane[0].src.buf = - raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i, - x->plane[0].src.buf, - x->plane[0].src.stride); - assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0); - x->e_mbd.plane[0].pre[0].buf = - raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i, - x->e_mbd.plane[0].pre[0].buf, - x->e_mbd.plane[0].pre[0].stride); + MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + + p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf, + p->src.stride); + assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); + pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf, + pd->pre[0].stride); if (mbmi->ref_frame[1]) - x->e_mbd.plane[0].pre[1].buf = - raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i, - x->e_mbd.plane[0].pre[1].buf, - x->e_mbd.plane[0].pre[1].stride); + pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf, + pd->pre[1].stride); } static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, struct buf_2d orig_pre[2]) { - MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi; x->plane[0].src = orig_src; x->e_mbd.plane[0].pre[0] = orig_pre[0]; if (mbmi->ref_frame[1]) @@ -1811,13 +1644,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int i, j, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; - MODE_INFO *mi = x->e_mbd.mode_info_context; + MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; int segmentyrate = 0; - BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = 
num_4x4_blocks_high_lookup[bsize]; vp9_variance_fn_ptr_t *v_fn_ptr; @@ -1874,7 +1707,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 && (mbmi->ref_frame[1] <= 0 || frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) { - int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -1919,6 +1752,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int thissme, bestsme = INT_MAX; int sadpb = x->sadperbit4; int_mv mvp_full; + int max_mv; /* Is the best so far sufficiently good that we cant justify doing * and new motion search. */ @@ -1928,40 +1762,58 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->compressor_speed) { // use previous block's result as next block's MV predictor. if (i > 0) { - bsi->mvp.as_int = - x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int; + bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; if (i == 2) - bsi->mvp.as_int = - x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int; + bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; } } + if (i == 0) + max_mv = x->max_mv_context[mbmi->ref_frame[0]]; + else + max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3; + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { // Take wtd average of the step_params based on the last frame's // max mv magnitude and the best ref mvs of the current block for // the given reference. - if (i == 0) - step_param = (vp9_init_search_range( - cpi, x->max_mv_context[mbmi->ref_frame[0]]) + - cpi->mv_step_param) >> 1; - else - step_param = (vp9_init_search_range( - cpi, MAX(abs(bsi->mvp.as_mv.row), - abs(bsi->mvp.as_mv.col)) >> 3) + - cpi->mv_step_param) >> 1; + step_param = (vp9_init_search_range(cpi, max_mv) + + cpi->mv_step_param) >> 1; } else { step_param = cpi->mv_step_param; } - further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; - mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3; mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3; + if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) { + mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3; + mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3; + step_param = MAX(step_param, 8); + } + + further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; // adjust src pointer for this block mi_buf_shift(x, i); - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 0, v_fn_ptr, - bsi->ref_mv, &mode_mv[NEWMV]); + if (cpi->sf.search_method == HEX) { + bestsme = vp9_hex_search(x, &mvp_full, + step_param, + sadpb, 1, v_fn_ptr, 1, + bsi->ref_mv, &mode_mv[NEWMV]); + } else if (cpi->sf.search_method == SQUARE) { + bestsme = vp9_square_search(x, &mvp_full, + step_param, + sadpb, 1, v_fn_ptr, 1, + bsi->ref_mv, &mode_mv[NEWMV]); + } else if (cpi->sf.search_method == BIGDIA) { + bestsme = vp9_bigdia_search(x, &mvp_full, + step_param, + sadpb, 1, v_fn_ptr, 1, + bsi->ref_mv, &mode_mv[NEWMV]); + } else { + bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, + sadpb, further_steps, 0, v_fn_ptr, + bsi->ref_mv, &mode_mv[NEWMV]); + } // Should we do a full search (best quality only) if (cpi->compressor_speed == 0) { @@ -1976,13 +1828,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (thissme < bestsme) { bestsme = thissme; - mode_mv[NEWMV].as_int = - 
x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int; + mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int; } else { /* The full search result is actually worse so re-instate the * previous best vector */ - x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = - mode_mv[NEWMV].as_int; + mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int; } } @@ -1991,19 +1841,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, unsigned int sse; cpi->find_fractional_mv_step(x, &mode_mv[NEWMV], bsi->ref_mv, x->errorperbit, v_fn_ptr, + 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &distortion, &sse); - // safe motion search result for use in compound prediction + // save motion search result for use in compound prediction seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int; } + if (cpi->sf.adaptive_motion_search) + x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int; + // restore src pointers mi_buf_restore(x, orig_src, orig_pre); } if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV && - mbmi->interp_filter == vp9_switchable_interp[0]) { + mbmi->interp_filter == EIGHTTAP) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; @@ -2114,7 +1968,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (best_rd == INT64_MAX) { int iy, midx; for (iy = i + 1; iy < 4; ++iy) - for (midx = 0; midx < VP9_INTER_MODES; ++midx) + for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; return; @@ -2138,7 +1992,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (this_segment_rd > bsi->segment_rd) { int iy, midx; for (iy = i + 1; iy < 4; ++iy) - for (midx = 0; midx < VP9_INTER_MODES; ++midx) + for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; return; @@ -2182,7 +2036,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int i; BEST_SEG_INFO *bsi = bsi_buf + filter_idx; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mode_info_context; + MODE_INFO *mi = xd->this_mi; MB_MODE_INFO *mbmi = &mi->mbmi; int mode_idx; @@ -2217,7 +2071,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, *returntotrate = bsi->r; *returndistortion = bsi->d; *returnyrate = bsi->segment_yrate; - *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8); + *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0); *psse = bsi->sse; mbmi->mode = bsi->modes[3]; @@ -2226,9 +2080,9 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, - int ref_frame, BLOCK_SIZE_TYPE block_size ) { + int ref_frame, BLOCK_SIZE block_size ) { MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; int_mv this_mv; int i; int zero_seen = 0; @@ -2240,10 +2094,15 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; int row_offset, col_offset; + int num_mv_refs = MAX_MV_REF_CANDIDATES + + (cpi->sf.adaptive_motion_search && + cpi->common.show_frame && + block_size < cpi->sf.max_partition_size); // Get the sad for each candidate reference mv - for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) { - this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int; + for (i = 0; i < num_mv_refs; i++) { + this_mv.as_int = (i < 
MAX_MV_REF_CANDIDATES) ? + mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int; max_mv = MAX(max_mv, MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); @@ -2279,7 +2138,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, vp9_prob *comp_mode_p) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, + int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); @@ -2341,14 +2200,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int_mv *second_ref_mv, int64_t comp_pred_diff[NB_PREDICTION_TYPES], int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) { + int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->skip = x->skip; ctx->best_mode_index = mode_index; - ctx->mic = *xd->mode_info_context; + ctx->mic = *xd->this_mi; if (partition) ctx->partition_info = *partition; @@ -2364,7 +2223,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, // doesn't actually work this way memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1)); + sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2395,7 +2254,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int idx, MV_REFERENCE_FRAME frame_type, - BLOCK_SIZE_TYPE block_size, + BLOCK_SIZE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], @@ -2404,17 +2263,17 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, VP9_COMMON *cm = &cpi->common; YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; // set up scaling factors scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; scale[frame_type].x_offset_q4 = ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp, - VP9_REF_SCALE_SHIFT) & 0xf; + REF_SCALE_SHIFT) & 0xf; scale[frame_type].y_offset_q4 = ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp, - VP9_REF_SCALE_SHIFT) & 0xf; + REF_SCALE_SHIFT) & 0xf; // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. 
@@ -2422,11 +2281,10 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context, - xd->prev_mode_info_context, + vp9_find_mv_refs(&cpi->common, xd, xd->this_mi, + xd->last_mi, frame_type, - mbmi->ref_mvs[frame_type], - cpi->common.ref_frame_sign_bias, mi_row, mi_col); + mbmi->ref_mvs[frame_type], mi_row, mi_col); // Candidate refinement carried out at encoder and decoder vp9_find_best_ref_mvs(xd, @@ -2437,8 +2295,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. - if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE && - scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE) + if (!vp9_is_scaled(&scale[frame_type])) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -2446,27 +2303,27 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; int fb = get_ref_frame_idx(cpi, ref_frame); - if (cpi->scaled_ref_idx[fb] != cpi->common.ref_frame_map[fb]) - scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb]]; + int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame); + if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb]) + scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]]; return scaled_ref_frame; } -static INLINE int get_switchable_rate(MACROBLOCK *x) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - - const int c = vp9_get_pred_context_switchable_interp(xd); - const int m = vp9_switchable_interp_map[mbmi->interp_filter]; - return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; +static INLINE int get_switchable_rate(const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + const int ctx = vp9_get_pred_context_switchable_interp(xd); + return SWITCHABLE_INTERP_RATE_FACTOR * + x->switchable_interp_costs[ctx][mbmi->interp_filter]; } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; VP9_COMMON *cm = &cpi->common; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; int bestsme = INT_MAX; int further_steps, step_param; @@ -2474,7 +2331,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int_mv mvp_full; int ref = mbmi->ref_frame[0]; int_mv ref_mv = mbmi->ref_mvs[ref][0]; - const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -2494,7 +2351,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } - vp9_clamp_mv_min_max(x, &ref_mv); + vp9_clamp_mv_min_max(x, &ref_mv.as_mv); // Adjust search parameters based on small partitions' result. 
if (x->fast_ms) { @@ -2506,7 +2363,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, step_param = 8; // Get prediction MV. - mvp_full.as_int = x->pred_mv.as_int; + mvp_full.as_int = x->pred_mv[ref].as_int; // Adjust MV sign if needed. if (cm->ref_frame_sign_bias[ref]) { @@ -2525,21 +2382,49 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } else { step_param = cpi->mv_step_param; } - // mvp_full.as_int = ref_mv[0].as_int; - mvp_full.as_int = - mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; } + if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && + cpi->common.show_frame) { + int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize), + b_width_log2(bsize))); + step_param = MAX(step_param, boffset); + } + + mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ? + mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int : + x->pred_mv[ref].as_int; + mvp_full.as_mv.col >>= 3; mvp_full.as_mv.row >>= 3; // Further step/diamond searches as necessary further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[block_size], - &ref_mv, tmp_mv); + if (cpi->sf.search_method == HEX) { + bestsme = vp9_hex_search(x, &mvp_full, + step_param, + sadpb, 1, + &cpi->fn_ptr[block_size], 1, + &ref_mv, tmp_mv); + } else if (cpi->sf.search_method == SQUARE) { + bestsme = vp9_square_search(x, &mvp_full, + step_param, + sadpb, 1, + &cpi->fn_ptr[block_size], 1, + &ref_mv, tmp_mv); + } else if (cpi->sf.search_method == BIGDIA) { + bestsme = vp9_bigdia_search(x, &mvp_full, + step_param, + sadpb, 1, + &cpi->fn_ptr[block_size], 1, + &ref_mv, tmp_mv); + } else { + bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, + sadpb, further_steps, 1, + &cpi->fn_ptr[block_size], + &ref_mv, tmp_mv); + } x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -2547,17 +2432,22 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->mv_row_max = tmp_row_max; if (bestsme < INT_MAX) { - int dis; /* TODO: use dis in distortion calculation later. */ + int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv, x->errorperbit, &cpi->fn_ptr[block_size], + 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, &sse); } *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv, x->nmvjointcost, x->mvcost, 96); + + if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) + x->pred_mv[ref].as_int = tmp_mv->as_int; + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) @@ -2566,18 +2456,18 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv) { int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv ref_mv[2]; - const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); int ite; // Prediction buffer from second frame. 
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); @@ -2653,7 +2543,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Compound motion search on first ref frame. if (id) xd->plane[0].pre[0] = ref_yv12[id]; - vp9_clamp_mv_min_max(x, &ref_mv[id]); + vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv); // Use mv result from single mode as mvp. tmp_mv.as_int = frame_mv[refs[id]].as_int; @@ -2678,13 +2568,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv, - &ref_mv[id], - x->errorperbit, - &cpi->fn_ptr[block_size], - x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, - pw, ph); + bestsme = cpi->find_fractional_mv_step_comp( + x, &tmp_mv, + &ref_mv[id], + x->errorperbit, + &cpi->fn_ptr[block_size], + 0, cpi->sf.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, + pw, ph); } if (id) @@ -2721,7 +2613,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, int *skippable, @@ -2732,10 +2624,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], - int64_t *psse, int64_t ref_best_rd) { + int64_t *psse, + const int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; const int is_comp_pred = (mbmi->ref_frame[1] > 0); const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; @@ -2747,7 +2640,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t this_rd = 0; DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); int pred_exists = 0; - int interpolating_intpel_seen = 0; int intpel_mv; int64_t rd, best_rd = INT64_MAX; int best_needs_copy = 0; @@ -2782,7 +2674,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); *rate2 += rate_mv; frame_mv[refs[0]].as_int = - xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int; single_newmv[refs[0]].as_int = tmp_mv.as_int; } } @@ -2790,9 +2682,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // if we're near/nearest and mv == 0,0, compare to zeromv if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[refs[0]].as_int == 0 && - !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) && + !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) && (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { - int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -2849,7 +2741,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * words if you present them in that order, the second one is always known * if the first is known */ *rate2 += cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + mbmi->mode_context[mbmi->ref_frame[0]]); if (!(*mode_excluded)) { if (is_comp_pred) { @@ -2860,7 +2752,6 @@ 
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } pred_exists = 0; - interpolating_intpel_seen = 0; // Are all MVs integer pel for Y and UV intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 && (mbmi->mv[0].as_mv.col & 15) == 0; @@ -2869,98 +2760,97 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (mbmi->mv[1].as_mv.col & 15) == 0; // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - *best_filter = EIGHTTAP; - if (cpi->sf.use_8tap_always) { + if (cm->mcomp_filter_type != BILINEAR) { *best_filter = EIGHTTAP; - vp9_zero(cpi->rd_filter_cache); - } else { - int i, newbest; - int tmp_rate_sum = 0; - int64_t tmp_dist_sum = 0; - - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { - int j; - int64_t rs_rd; - const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i]; - const int is_intpel_interp = intpel_mv; - mbmi->interp_filter = filter; - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - rs = get_switchable_rate(x); - rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - - if (interpolating_intpel_seen && is_intpel_interp) { - cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, - tmp_rate_sum, tmp_dist_sum); - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], - cpi->rd_filter_cache[i] + rs_rd); - rd = cpi->rd_filter_cache[i]; - if (cm->mcomp_filter_type == SWITCHABLE) - rd += rs_rd; - } else { - int rate_sum = 0; - int64_t dist_sum = 0; - if ((cm->mcomp_filter_type == SWITCHABLE && - (!i || best_needs_copy)) || - (cm->mcomp_filter_type != SWITCHABLE && - (cm->mcomp_filter_type == mbmi->interp_filter || - (!interpolating_intpel_seen && is_intpel_interp)))) { - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = orig_dst[j]; - xd->plane[j].dst.stride = orig_dst_stride[j]; - } + if (x->source_variance < + cpi->sf.disable_filter_search_var_thresh) { + *best_filter = EIGHTTAP; + vp9_zero(cpi->rd_filter_cache); + } else { + int i, newbest; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; + + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + int j; + int64_t rs_rd; + mbmi->interp_filter = i; + vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + rs = get_switchable_rate(x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + + if (i > 0 && intpel_mv) { + cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, + tmp_rate_sum, tmp_dist_sum); + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; } else { - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; - xd->plane[j].dst.stride = 64; + int rate_sum = 0; + int64_t dist_sum = 0; + if ((cm->mcomp_filter_type == SWITCHABLE && + (!i || best_needs_copy)) || + (cm->mcomp_filter_type != SWITCHABLE && + (cm->mcomp_filter_type == mbmi->interp_filter || + (i == 0 && intpel_mv)))) { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = orig_dst[j]; + xd->plane[j].dst.stride = orig_dst_stride[j]; + } + } else { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; + xd->plane[j].dst.stride = 64; + } + } + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); + cpi->rd_filter_cache[i] = 
RDCOST(x->rdmult, x->rddiv, + rate_sum, dist_sum); + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; + if (i == 0 && intpel_mv) { + tmp_rate_sum = rate_sum; + tmp_dist_sum = dist_sum; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); - cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, - rate_sum, dist_sum); - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], - cpi->rd_filter_cache[i] + rs_rd); - rd = cpi->rd_filter_cache[i]; - if (cm->mcomp_filter_type == SWITCHABLE) - rd += rs_rd; - if (!interpolating_intpel_seen && is_intpel_interp) { - tmp_rate_sum = rate_sum; - tmp_dist_sum = dist_sum; - } - } - if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - if (rd / 2 > ref_best_rd) { - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; + if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + if (rd / 2 > ref_best_rd) { + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; } - return INT64_MAX; } - } - newbest = i == 0 || rd < best_rd; - - if (newbest) { - best_rd = rd; - *best_filter = mbmi->interp_filter; - if (cm->mcomp_filter_type == SWITCHABLE && i && - !(interpolating_intpel_seen && is_intpel_interp)) - best_needs_copy = !best_needs_copy; - } + newbest = i == 0 || rd < best_rd; + + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv) + best_needs_copy = !best_needs_copy; + } - if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || - (cm->mcomp_filter_type != SWITCHABLE && - cm->mcomp_filter_type == mbmi->interp_filter)) { - pred_exists = 1; + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + pred_exists = 1; + } } - interpolating_intpel_seen |= is_intpel_interp; - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } } } // Set the appropriate filter @@ -3003,30 +2893,34 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mcomp_filter_type == SWITCHABLE) *rate2 += get_switchable_rate(x); - if (!is_comp_pred) { + if (!is_comp_pred && cpi->enable_encode_breakout) { if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; else if (x->encode_breakout) { - const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, - &xd->plane[1]); + const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); unsigned int var, sse; // Skipping threshold for ac. unsigned int thresh_ac; // The encode_breakout input unsigned int encode_breakout = x->encode_breakout << 4; + int max_thresh = 36000; + + // Use extreme low threshold for static frames to limit skipping. 
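Aside: the reworked switchable-filter loop above exploits one property: when both motion vectors are integer pel, every interpolation filter yields identical pixels, so the rate and distortion modeled for the first filter (i == 0) can be reused for the rest, and only the filter-signaling cost rs_rd differs between candidates. A condensed sketch of that caching pattern, with model_cost() and signal_cost() as illustrative stand-ins for model_rd_for_sb() and get_switchable_rate():

    #include <stdint.h>

    #define N_FILTERS 3  /* SWITCHABLE_FILTERS */

    static int64_t model_cost(int filter)  { return 1000 - filter; }  /* stub */
    static int64_t signal_cost(int filter) { return filter; }         /* stub */

    static int pick_filter_sketch(int intpel_mv, int64_t cache[N_FILTERS + 1]) {
      int best = 0;
      int64_t best_rd = INT64_MAX, rd0 = 0;
      cache[N_FILTERS] = INT64_MAX;
      for (int f = 0; f < N_FILTERS; ++f) {
        /* Integer-pel MVs: all filters produce the same prediction, so the
         * modeled cost of filter 0 is reused instead of re-predicting. */
        const int64_t base = (f > 0 && intpel_mv) ? rd0 : model_cost(f);
        const int64_t rd = base + signal_cost(f);
        if (f == 0) rd0 = base;
        cache[f] = base;
        if (cache[N_FILTERS] > rd) cache[N_FILTERS] = rd;
        if (rd < best_rd) { best_rd = rd; best = f; }
      }
      return best;
    }

In the real loop the signaling cost is only added when the frame-level filter type is SWITCHABLE, and the cached per-filter costs are reused later when computing best_filter_diff.
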
+ if (cpi->enable_encode_breakout == 2) + max_thresh = 128; // Calculate threshold according to dequant value. thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; - // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. - if (thresh_ac > 36000) - thresh_ac = 36000; - // Use encode_breakout input if it is bigger than internal threshold. if (thresh_ac < encode_breakout) thresh_ac = encode_breakout; + // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. + if (thresh_ac > max_thresh) + thresh_ac = max_thresh; + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, &sse); @@ -3065,8 +2959,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (sse_v - var_v < thresh_dc || sse_v == var_v)) { x->skip = 1; - *rate2 = 500; - *rate_uv = 0; + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); // Scaling factor for SSE from spatial domain to frequency domain // is 16. Adjust distortion accordingly. @@ -3084,7 +2978,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { int skippable_y, skippable_uv; - int64_t sseuv = INT_MAX; + int64_t sseuv = INT64_MAX; + int64_t rdcosty = INT64_MAX; // Y cost and distortion super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, @@ -3103,8 +2998,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += *rate_y; *distortion += *distortion_y; - super_block_uvrd(cm, x, rate_uv, distortion_uv, - &skippable_uv, &sseuv, bsize); + rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); + rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); + + super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + bsize, ref_best_rd - rdcosty); + if (*rate_uv == INT_MAX) { + *rate2 = INT_MAX; + *distortion = INT64_MAX; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } *psse += sseuv; *rate2 += *rate_uv; @@ -3122,17 +3029,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *returnrate, int64_t *returndist, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; - int y_skip = 0, uv_skip; + int y_skip = 0, uv_skip = 0; int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 }; x->skip_encode = 0; ctx->skip = 0; - xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; - if (bsize >= BLOCK_SIZE_SB8X8) { + xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME; + if (bsize >= BLOCK_8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, &y_skip, bsize, tx_cache, best_rd) >= best_rd) { @@ -3149,46 +3056,46 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, return; } rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8); + &dist_uv, &uv_skip, BLOCK_8X8); } if (y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); - *returndist = dist_y + (dist_uv >> 2); + *returndist = dist_y + dist_uv; vp9_zero(ctx->tx_rd_diff); } else { int i; *returnrate = rate_y + rate_uv + 
vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); - *returndist = dist_y + (dist_uv >> 2); + *returndist = dist_y + dist_uv; if (cpi->sf.tx_size_search_method == USE_FULL_RD) for (i = 0; i < TX_MODES; i++) ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; } - ctx->mic = *xd->mode_info_context; + ctx->mic = *xd->this_mi; } int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const struct segmentation *seg = &xd->seg; - const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); - MB_PREDICTION_MODE this_mode; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + const struct segmentation *seg = &cm->seg; + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); + RD_PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; - unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; + unsigned char segment_id = mbmi->segment_id; int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - int_mv single_newmv[MAX_REF_FRAMES]; + int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int idx_list[4] = {0, @@ -3201,9 +3108,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_tx_diff[TX_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; - MB_MODE_INFO best_mbmode; + int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + MB_MODE_INFO best_mbmode = { 0 }; int j; int mode_index, best_mode_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; @@ -3228,14 +3135,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; union b_mode_info best_bmodes[4]; PARTITION_INFO best_partition; - int bwsl = b_width_log2(bsize); - int bws = (1 << bwsl) / 4; // mode_info step for subsize - int bhsl = b_height_log2(bsize); - int bhs = (1 << bhsl) / 4; // mode_info step for subsize + const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; + const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; - x->skip_encode = (cpi->sf.skip_encode_frame && - xd->q_index < QIDX_SKIP_THRESH); + x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; for (i = 0; i < 4; i++) { int j; @@ -3248,14 +3152,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); - vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); - vpx_memset(&single_newmv, 0, sizeof(single_newmv)); for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + for (i = 0; i <= SWITCHABLE_FILTERS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3312,7 +3214,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; 
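Aside: back in handle_inter_mode(), the encode-breakout hunk above derives its AC skip threshold from the dequantizer step, floors it with the user's encode_breakout setting, and now caps it far more aggressively (128 instead of 36000) when enable_encode_breakout == 2 is used for static content; on a skip it also charges the real cost of the skip bit rather than a flat rate of 500. The threshold computation, restated as a self-contained sketch using the constants visible in the diff:

    /* Mirrors the AC-threshold logic in handle_inter_mode() above. */
    static unsigned int ac_skip_thresh(int ac_dequant,
                                       unsigned int encode_breakout,
                                       int breakout_mode) {
      unsigned int thresh_ac = (unsigned int)(ac_dequant * ac_dequant) / 9;
      const unsigned int max_thresh = (breakout_mode == 2) ? 128 : 36000;

      /* Use the encode_breakout input if it exceeds the internal value. */
      if (thresh_ac < (encode_breakout << 4))
        thresh_ac = encode_breakout << 4;

      /* Cap the threshold to avoid big PSNR loss at low bitrates. */
      if (thresh_ac > max_thresh)
        thresh_ac = max_thresh;
      return thresh_ac;
    }
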
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; - int skippable; + int skippable = 0; int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; @@ -3327,10 +3229,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, ref_frame = vp9_mode_order[mode_index].ref_frame; second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; - // Skip modes that have been masked off but always consider first mode. - if (mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) && - (cpi->unused_mode_skip_mask & (1 << mode_index)) ) - continue; + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (mode_index > cpi->sf.mode_skip_start) { + if (mode_index == (cpi->sf.mode_skip_start + 1)) { + switch (vp9_mode_order[best_mode_index].ref_frame) { + case INTRA_FRAME: + cpi->mode_skip_mask = 0; + break; + case LAST_FRAME: + cpi->mode_skip_mask = LAST_FRAME_MODE_MASK; + break; + case GOLDEN_FRAME: + cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK; + break; + case ALTREF_FRAME: + cpi->mode_skip_mask = ALT_REF_MODE_MASK; + break; + case NONE: + case MAX_REF_FRAMES: + assert(!"Invalid Reference frame"); + } + } + if (cpi->mode_skip_mask & (1 << mode_index)) + continue; + } // Skip if the current reference frame has been masked off if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && @@ -3339,7 +3262,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Test best rd so far against threshold for trying this mode. if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] * - cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) || + cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) || cpi->rd_threshes[bsize][mode_index] == INT_MAX) continue; @@ -3355,7 +3278,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (x->fast_ms > 2 && ref_frame != x->subblock_ref) continue; - if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) { + if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) { if (!(ref_frame_mask & (1 << ref_frame))) { continue; } @@ -3393,19 +3316,24 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning, jkoleszar): scaling reference frame not supported for // SPLITMV. 
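Aside: the new skip logic above replaces the old unused_mode_skip mask. Once cpi->sf.mode_skip_start candidates have been evaluated, a bitmask chosen by the reference frame of the best mode so far prunes the remaining candidates. A sketch of the pattern; the real mask constants (LAST_FRAME_MODE_MASK and friends) are defined elsewhere in the encoder, and the values below are placeholders:

    enum ref { INTRA = 0, LAST, GOLDEN, ALTREF };

    static unsigned int skip_mask_for_best_ref(int best_ref) {
      switch (best_ref) {
        case LAST:   return 0x00F0F0F0u;  /* placeholder LAST_FRAME_MODE_MASK */
        case GOLDEN: return 0x00F0F00Fu;  /* placeholder GOLDEN_FRAME_MODE_MASK */
        case ALTREF: return 0x00F00FF0u;  /* placeholder ALT_REF_MODE_MASK */
        default:     return 0;            /* intra best: keep every mode */
      }
    }

    /* In the mode loop:
     *   if (mode_index > sf->mode_skip_start &&
     *       (mode_skip_mask & (1 << mode_index)))
     *     continue;
     */
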
if (ref_frame > 0 && - (scale_factor[ref_frame].x_scale_fp != VP9_REF_NO_SCALE || - scale_factor[ref_frame].y_scale_fp != VP9_REF_NO_SCALE) && - this_mode == SPLITMV) + vp9_is_scaled(&scale_factor[ref_frame]) && + this_mode == RD_SPLITMV) continue; if (second_ref_frame > 0 && - (scale_factor[second_ref_frame].x_scale_fp != VP9_REF_NO_SCALE || - scale_factor[second_ref_frame].y_scale_fp != VP9_REF_NO_SCALE) && - this_mode == SPLITMV) + vp9_is_scaled(&scale_factor[second_ref_frame]) && + this_mode == RD_SPLITMV) + continue; + + if (bsize >= BLOCK_8X8 && + (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)) + continue; + + if (bsize < BLOCK_8X8 && + !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)) continue; set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); - mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; // Evaluate all sub-pel filters irrespective of whether we can use @@ -3413,13 +3341,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = cm->mcomp_filter_type; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - if (bsize >= BLOCK_SIZE_SB8X8 && - (this_mode == I4X4_PRED || this_mode == SPLITMV)) - continue; - if (bsize < BLOCK_SIZE_SB8X8 && - !(this_mode == I4X4_PRED || this_mode == SPLITMV)) - continue; - if (comp_pred) { if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -3452,7 +3373,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { + (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to @@ -3464,11 +3385,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if ((this_mode != ZEROMV && - !(this_mode == NEARMV && - frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && - !(this_mode == NEARESTMV && - frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || + if ((this_mode != RD_ZEROMV && + !(this_mode == RD_NEARMV && + frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) && + !(this_mode == RD_NEARESTMV && + frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) || ref_frame != ALTREF_FRAME) { continue; } @@ -3480,11 +3401,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // a representative block in the boundary ( first ) and then implement a // function that does sads when inside the border.. if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) && - this_mode == NEWMV) { + this_mode == RD_NEWMV) { continue; } - if (this_mode == I4X4_PRED) { +#ifdef MODE_TEST_HIT_STATS + // TEST/DEBUG CODE + // Keep a rcord of the number of test hits at each size + cpi->mode_test_hits[bsize]++; +#endif + + if (this_mode == RD_I4X4_PRED) { int rate; /* @@ -3493,8 +3420,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; */ - // I4X4_PRED is only considered for block sizes less than 8x8. - mbmi->txfm_size = TX_4X4; + // RD_I4X4_PRED is only considered for block sizes less than 8x8. 
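Aside: the vp9_is_scaled() helper introduced above is exactly the predicate the deleted lines spelled out inline; a one-to-one sketch (the numeric value of VP9_REF_NO_SCALE is an assumption here, only the comparison matters):

    #define REF_NO_SCALE (1 << 14)  /* assumed value of VP9_REF_NO_SCALE */

    struct scale_factors_sketch { int x_scale_fp, y_scale_fp; };

    static int is_scaled_sketch(const struct scale_factors_sketch *sf) {
      /* Identical to the removed open-coded test in the hunk above. */
      return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE;
    }
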
+ mbmi->tx_size = TX_4X4; if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, best_rd) >= best_rd) continue; @@ -3521,31 +3448,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes - static const int skip_intra_var_thresh[BLOCK_SIZE_TYPES] = { + static const int skip_intra_var_thresh[BLOCK_SIZES] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - this_mode != DC_PRED && + this_mode != RD_DC_PRED && x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) continue; // Only search the oblique modes if the best so far is // one of the neighboring directional modes if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) { if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME) continue; } + mbmi->mode = rd_mode_to_mode(this_mode); if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { if (conditional_skipintra(mbmi->mode, best_intra_mode)) continue; } + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; - uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]); + uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]); if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], @@ -3559,10 +3488,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->uv_mode = mode_uv[uv_tx]; rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; - if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) + if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; - } else if (this_mode == SPLITMV) { + } else if (this_mode == RD_SPLITMV) { const int is_comp_pred = second_ref_frame > 0; int rate; int64_t distortion; @@ -3577,7 +3506,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, union b_mode_info tmp_best_bmodes[16]; MB_MODE_INFO tmp_best_mbmode; PARTITION_INFO tmp_best_partition; - BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS]; + BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; int pred_exists = 0; int uv_skippable; if (is_comp_pred) { @@ -3595,70 +3524,79 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->rd_threshes[bsize][THR_NEWA]; this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? 
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh; - xd->mode_info_context->mbmi.txfm_size = TX_4X4; - - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; - for (switchable_filter_index = 0; - switchable_filter_index < VP9_SWITCHABLE_FILTERS; - ++switchable_filter_index) { - int newbest, rs; - int64_t rs_rd; - mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index]; - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, - &mbmi->ref_mvs[ref_frame][0], - second_ref, - best_yrd, - &rate, &rate_y, &distortion, - &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, - bsi, switchable_filter_index, - mi_row, mi_col); - - if (tmp_rd == INT64_MAX) - continue; - cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; - rs = get_switchable_rate(x); - rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd); - if (cm->mcomp_filter_type == SWITCHABLE) - tmp_rd += rs_rd; - - newbest = (tmp_rd < tmp_best_rd); - if (newbest) { - tmp_best_filter = mbmi->interp_filter; - tmp_best_rd = tmp_rd; - } - if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || - (mbmi->interp_filter == cm->mcomp_filter_type && - cm->mcomp_filter_type != SWITCHABLE)) { - tmp_best_rdu = tmp_rd; - tmp_best_rate = rate; - tmp_best_ratey = rate_y; - tmp_best_distortion = distortion; - tmp_best_sse = total_sse; - tmp_best_skippable = skippable; - tmp_best_mbmode = *mbmi; - tmp_best_partition = *x->partition_info; - for (i = 0; i < 4; i++) - tmp_best_bmodes[i] = xd->mode_info_context->bmi[i]; - pred_exists = 1; - if (switchable_filter_index == 0 && - cpi->sf.use_rd_breakout && - best_rd < INT64_MAX) { - if (tmp_best_rdu / 2 > best_rd) { - // skip searching the other filters if the first is - // already substantially larger than the best so far + xd->this_mi->mbmi.tx_size = TX_4X4; + + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; + if (cm->mcomp_filter_type != BILINEAR) { + tmp_best_filter = EIGHTTAP; + if (x->source_variance < + cpi->sf.disable_filter_search_var_thresh) { + tmp_best_filter = EIGHTTAP; + vp9_zero(cpi->rd_filter_cache); + } else { + for (switchable_filter_index = 0; + switchable_filter_index < SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int newbest, rs; + int64_t rs_rd; + mbmi->interp_filter = switchable_filter_index; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + &mbmi->ref_mvs[ref_frame][0], + second_ref, + best_yrd, + &rate, &rate_y, &distortion, + &skippable, &total_sse, + (int)this_rd_thresh, seg_mvs, + bsi, switchable_filter_index, + mi_row, mi_col); + + if (tmp_rd == INT64_MAX) + continue; + cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; + rs = get_switchable_rate(x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + tmp_rd + rs_rd); + if (cm->mcomp_filter_type == SWITCHABLE) + tmp_rd += rs_rd; + + newbest = (tmp_rd < tmp_best_rd); + if (newbest) { tmp_best_filter = mbmi->interp_filter; - tmp_best_rdu = INT64_MAX; - break; + tmp_best_rd = tmp_rd; } - } + if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || + (mbmi->interp_filter == cm->mcomp_filter_type && + cm->mcomp_filter_type != SWITCHABLE)) { + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_sse = total_sse; + 
tmp_best_skippable = skippable; + tmp_best_mbmode = *mbmi; + tmp_best_partition = *x->partition_info; + for (i = 0; i < 4; i++) + tmp_best_bmodes[i] = xd->this_mi->bmi[i]; + pred_exists = 1; + if (switchable_filter_index == 0 && + cpi->sf.use_rd_breakout && + best_rd < INT64_MAX) { + if (tmp_best_rdu / 2 > best_rd) { + // skip searching the other filters if the first is + // already substantially larger than the best so far + tmp_best_filter = mbmi->interp_filter; + tmp_best_rdu = INT64_MAX; + break; + } + } + } + } // switchable_filter_index loop } - } // switchable_filter_index loop + } if (tmp_best_rdu == INT64_MAX) continue; @@ -3694,7 +3632,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *mbmi = tmp_best_mbmode; *x->partition_info = tmp_best_partition; for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i] = tmp_best_bmodes[i]; + xd->this_mi->bmi[i] = tmp_best_bmodes[i]; } rate2 += rate; @@ -3711,16 +3649,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred); - if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) < - best_rd) { + tmp_best_rdu = best_rd - + MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2), + RDCOST(x->rdmult, x->rddiv, 0, total_sse)); + + if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, - BLOCK_SIZE_SB8X8); - vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8); - super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv, - &uv_skippable, &uv_sse, - BLOCK_SIZE_SB8X8, TX_4X4); + BLOCK_8X8); + super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable, + &uv_sse, BLOCK_8X8, tmp_best_rdu); + if (rate_uv == INT_MAX) + continue; rate2 += rate_uv; distortion2 += distortion_uv; skippable = skippable && uv_skippable; @@ -3731,6 +3672,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, tx_cache[i] = tx_cache[ONLY_4X4]; } } else { + mbmi->mode = rd_mode_to_mode(this_mode); compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); this_rd = handle_inter_mode(cpi, x, bsize, tx_cache, @@ -3766,7 +3708,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); - if (skippable && bsize >= BLOCK_SIZE_SB8X8) { + if (skippable && bsize >= BLOCK_8X8) { // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); // for best yrd calculation @@ -3815,30 +3757,30 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } // Keep record of best intra rd - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME && - is_intra_mode(xd->mode_info_context->mbmi.mode) && + if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME && + is_intra_mode(xd->this_mi->mbmi.mode) && this_rd < best_intra_rd) { best_intra_rd = this_rd; - best_intra_mode = xd->mode_info_context->mbmi.mode; + best_intra_mode = xd->this_mi->mbmi.mode; } // Keep record of best inter rd with single reference - if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME && - xd->mode_info_context->mbmi.ref_frame[1] == NONE && + if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME && + xd->this_mi->mbmi.ref_frame[1] == NONE && !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; best_inter_ref_frame = ref_frame; - // best_inter_mode = xd->mode_info_context->mbmi.mode; + // best_inter_mode = xd->this_mi->mbmi.mode; } if (!disable_skip && ref_frame == INTRA_FRAME) 
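Aside: for the SPLITMV path above (and the analogous rdcosty computation in handle_inter_mode()), the UV search is handed only the rate-distortion headroom left after Y, lower-bounded by the cost of coding a skip; when super_block_uvrd() cannot beat that budget it returns INT_MAX and the mode is dropped. Restated with the RDCOST macro from vp9_rdopt.h:

    #include <stdint.h>

    #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static int64_t uv_rd_budget(int rdmult, int rddiv,
                                int rate2, int64_t distortion2,
                                int64_t total_sse, int64_t best_rd) {
      /* Headroom = best so far minus the cheaper of "code it" and "skip it";
       * a result <= 0 means the UV search cannot possibly help. */
      return best_rd - MIN(RDCOST(rdmult, rddiv, rate2, distortion2),
                           RDCOST(rdmult, rddiv, 0, total_sse));
    }
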
{ for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + for (i = 0; i <= SWITCHABLE_FILTERS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } - if (this_mode != I4X4_PRED && this_mode != SPLITMV) { + if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) { // Store the respective mode distortions for later use. if (mode_distortions[this_mode] == -1 || distortion2 < mode_distortions[this_mode]) { @@ -3870,9 +3812,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_skip2 = this_skip2; best_partition = *x->partition_info; - if (this_mode == I4X4_PRED || this_mode == SPLITMV) + if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV) for (i = 0; i < 4; i++) - best_bmodes[i] = xd->mode_info_context->bmi[i]; + best_bmodes[i] = xd->this_mi->bmi[i]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history @@ -3890,29 +3832,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } } -#if 0 - // Testing this mode gave rise to an improvement in best error score. - // Lower threshold a bit for next time - cpi->rd_thresh_mult[mode_index] = - (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? - cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; - cpi->rd_threshes[mode_index] = - (cpi->rd_baseline_thresh[mode_index] >> 7) - * cpi->rd_thresh_mult[mode_index]; -#endif - } else { - // If the mode did not help improve the best error case then - // raise the threshold for testing that mode next time around. -#if 0 - cpi->rd_thresh_mult[mode_index] += 4; - - if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) - cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; - - cpi->rd_threshes[mode_index] = - (cpi->rd_baseline_thresh[mode_index] >> 7) - * cpi->rd_thresh_mult[mode_index]; -#endif } /* keep record of best compound/single-only prediction */ @@ -3945,9 +3864,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? - VP9_SWITCHABLE_FILTERS : - vp9_switchable_interp_map[cm->mcomp_filter_type]]; - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + SWITCHABLE_FILTERS : cm->mcomp_filter_type]; + for (i = 0; i <= SWITCHABLE_FILTERS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. This can @@ -3964,16 +3882,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best txfm size */ if (bsize < BLOCK_32X32) { if (bsize < BLOCK_16X16) { - if (this_mode == SPLITMV || this_mode == I4X4_PRED) + if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED) tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; } tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; } if (!mode_excluded && this_rd != INT64_MAX) { - for (i = 0; i < TX_MODES; i++) { + for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) { int64_t adj_rd = INT64_MAX; - if (this_mode != I4X4_PRED) { + if (this_mode != RD_I4X4_PRED) { adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; } else { adj_rd = this_rd; @@ -4003,18 +3921,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], - (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 - : bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); } } - // If indicated then mark the index of the chosen mode to be inspected at - // other block sizes. - if (bsize <= cpi->sf.unused_mode_skip_lvl) { - cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask & - (~((int64_t)1 << best_mode_index)); - } - // If we are using reference masking and the set mask flag is set then // create the reference frame mask. if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) @@ -4039,7 +3949,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) { + if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { *returnrate = INT_MAX; *returndistortion = INT_MAX; return best_rd; @@ -4057,57 +3967,43 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->sf.adaptive_rd_thresh) { for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { if (mode_index == best_mode_index) { - cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT; + cpi->rd_thresh_freq_fact[bsize][mode_index] -= + (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3); } else { - cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC; + cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC; if (cpi->rd_thresh_freq_fact[bsize][mode_index] > - (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) { + (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) { cpi->rd_thresh_freq_fact[bsize][mode_index] = - cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT; + cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT; } } } } - // TODO(rbultje) integrate with RD trd_thresh_freq_facthresholding -#if 0 - // Reduce the activation RD thresholds for the best choice mode - if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && - (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) { - int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2); - - cpi->rd_thresh_mult[best_mode_index] = - (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? 
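Aside: the adaptive rd_thresh_freq_fact update after the mode loop above replaces the old reset-to-base scheme. The winning mode's factor now decays geometrically (by 1/8 per hit), so its threshold relaxes gradually, while every other mode's factor creeps up by RD_THRESH_INC until it hits the adaptive_rd_thresh * MAX_RD_THRESH_FACT ceiling. As a sketch, with assumed values for the two constants:

    #define RD_THRESH_INC      1   /* assumed */
    #define MAX_RD_THRESH_FACT 32  /* assumed */

    static void update_thresh_freq_fact(int *fact, int num_modes,
                                        int best_mode,
                                        int adaptive_rd_thresh) {
      for (int m = 0; m < num_modes; ++m) {
        if (m == best_mode) {
          fact[m] -= fact[m] >> 3;  /* geometric decay toward the base */
        } else if ((fact[m] += RD_THRESH_INC) >
                   adaptive_rd_thresh * MAX_RD_THRESH_FACT) {
          fact[m] = adaptive_rd_thresh * MAX_RD_THRESH_FACT;  /* clamp */
        }
      }
    }
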
- cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; - cpi->rd_threshes[best_mode_index] = - (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; - } -#endif - // macroblock modes *mbmi = best_mbmode; x->skip |= best_skip2; if (best_mbmode.ref_frame[0] == INTRA_FRAME && - best_mbmode.sb_type < BLOCK_SIZE_SB8X8) { + best_mbmode.sb_type < BLOCK_8X8) { for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode; + xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode; } if (best_mbmode.ref_frame[0] != INTRA_FRAME && - best_mbmode.sb_type < BLOCK_SIZE_SB8X8) { + best_mbmode.sb_type < BLOCK_8X8) { for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i].as_mv[0].as_int = + xd->this_mi->bmi[i].as_mv[0].as_int = best_bmodes[i].as_mv[0].as_int; if (mbmi->ref_frame[1] > 0) for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i].as_mv[1].as_int = + xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int; *x->partition_info = best_partition; - mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int; + mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; } for (i = 0; i < NB_PREDICTION_TYPES; ++i) { @@ -4118,14 +4014,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= SWITCHABLE_FILTERS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else best_filter_diff[i] = best_rd - best_filter_rd[i]; } if (cm->mcomp_filter_type == SWITCHABLE) - assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0); + assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); } else { vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff)); } diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index 7c84b48..eba7df9 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -13,8 +13,6 @@ #define VP9_ENCODER_VP9_RDOPT_H_ #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) -#define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) - #define QIDX_SKIP_THRESH 115 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); @@ -22,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); void vp9_init_me_luts(); diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index 9564edc..10655e8 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -17,39 +17,42 @@ void vp9_enable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; + struct segmentation *const seg = &cpi->common.seg; - cpi->mb.e_mbd.seg.enabled = 1; - cpi->mb.e_mbd.seg.update_map = 1; - cpi->mb.e_mbd.seg.update_data = 1; + seg->enabled = 1; + seg->update_map = 1; + seg->update_data = 1; } void vp9_disable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; 
- cpi->mb.e_mbd.seg.enabled = 0; + struct segmentation *const seg = &cpi->common.seg; + seg->enabled = 0; } void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); + VP9_COMP *cpi = (VP9_COMP *)ptr; + struct segmentation *const seg = &cpi->common.seg; // Copy in the new segmentation map vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mi_rows * cpi->common.mi_cols)); // Signal that the map should be updated. - cpi->mb.e_mbd.seg.update_map = 1; - cpi->mb.e_mbd.seg.update_data = 1; + seg->update_map = 1; + seg->update_data = 1; } void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, unsigned char abs_delta) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); + VP9_COMP *cpi = (VP9_COMP *)ptr; + struct segmentation *const seg = &cpi->common.seg; - cpi->mb.e_mbd.seg.abs_delta = abs_delta; + seg->abs_delta = abs_delta; - vpx_memcpy(cpi->mb.e_mbd.seg.feature_data, feature_data, - sizeof(cpi->mb.e_mbd.seg.feature_data)); + vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); // TBD ?? Set the feature mask // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0, @@ -114,7 +117,7 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, +static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -126,8 +129,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - segment_id = mi->mbmi.segment_id; - xd->mode_info_context = mi; + segment_id = mi_8x8[0]->mbmi.segment_id; + set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); // Count the number of hits on each segment with no prediction @@ -135,7 +138,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type; // Test to see if the segment id matches the predicted value. 
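Aside: count_segs() above gathers three sets of statistics per block so that vp9_choose_segmap_coding_method() can price temporal prediction against direct coding: an unconditional histogram of segment ids, a hit/miss tally of the temporal prediction flag per prediction context, and a histogram restricted to mispredicted blocks. The bookkeeping in isolation (array sizes assume MAX_SEGMENTS == 8 and PREDICTION_PROBS == 3, matching the declarations in this file):

    static void count_one_block(int segment_id, int pred_segment_id,
                                int pred_context,
                                int no_pred_segcounts[8],
                                int (*temporal_predictor_count)[2],
                                int t_unpred_seg_counts[8]) {
      const int pred_flag = (segment_id == pred_segment_id);

      no_pred_segcounts[segment_id]++;                   /* direct coding    */
      temporal_predictor_count[pred_context][pred_flag]++;  /* flag pricing  */
      if (!pred_flag)
        t_unpred_seg_counts[segment_id]++;  /* ids still coded explicitly    */
    }
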
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); @@ -144,7 +147,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, // Store the prediction status for this mb and update counts // as appropriate - vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + vp9_set_pred_flag_seg_id(xd, pred_flag); temporal_predictor_count[pred_context][pred_flag]++; if (!pred_flag) @@ -153,95 +156,85 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, } } -static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi, +static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE bsize) { + const VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bwl, bhl; - const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); + int bw, bh; + const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = mi_width_log2(mi->mbmi.sb_type); - bhl = mi_height_log2(mi->mbmi.sb_type); - - if (bwl == bsl && bhl == bsl) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col); - } else if (bwl == bsl && bhl < bsl) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col); - count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col); - } else if (bwl < bsl && bhl == bsl) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col); - count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs); + bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; + bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; + + if (bw == bs && bh == bs) { + count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, bs, mi_row, mi_col); + } else if (bw == bs && bh < bs) { + count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, hbs, mi_row, mi_col); + count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, bs, hbs, + mi_row + hbs, mi_col); + } else if (bw < bs && bh == bs) { + count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col); + count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); } else { - BLOCK_SIZE_TYPE subsize; + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; - assert(bwl < bsl && bhl < bsl); - if (bsize == BLOCK_64X64) { - subsize = BLOCK_32X32; - } else if (bsize == BLOCK_32X32) { - subsize = BLOCK_16X16; - } else { - assert(bsize == BLOCK_16X16); - subsize = BLOCK_8X8; - } + assert(bw < bs && bh < bs); for (n = 0; n < 4; n++) { - const int y_idx = n >> 1, x_idx = n & 0x01; + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, + count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, - mi_row + y_idx * bs, mi_col + x_idx * bs, subsize); + 
mi_row + mi_dr, mi_col + mi_dc, subsize); } } } void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - struct segmentation *seg = &cpi->mb.e_mbd.seg; + struct segmentation *seg = &cm->seg; int no_pred_cost; int t_pred_cost = INT_MAX; int i, tile_col, mi_row, mi_col; - int temporal_predictor_count[PREDICTION_PROBS][2]; - int no_pred_segcounts[MAX_SEGMENTS]; - int t_unpred_seg_counts[MAX_SEGMENTS]; + int temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } }; + int no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; vp9_prob no_pred_tree[SEG_TREE_PROBS]; vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; - MODE_INFO *mi_ptr, *mi; + MODE_INFO **mi_ptr, **mi; // Set default state for the segment tree probabilities and the // temporal coding probabilities vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs)); - vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts)); - vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts)); - vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count)); - // First of all generate stats regarding how well the last segment map // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { vp9_get_tile_col_offsets(cm, tile_col); - mi_ptr = cm->mi + cm->cur_tile_mi_col_start; + mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index a692c01..63826ee 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -153,11 +153,11 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost arrays - ref_mv = &x->e_mbd.mode_info_context->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full, ref_mv, - step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], - NULL, NULL, NULL, NULL, - &best_ref_mv1); + ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; + bestsme = vp9_hex_search(x, &best_ref_mv1_full, + step_param, sadpb, 1, + &cpi->fn_ptr[BLOCK_16X16], + 0, &best_ref_mv1, ref_mv); #if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? 
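Aside: the temporal-filter hunks above keep the same two-stage motion search, a full-pel hex search followed by optional sub-pel refinement, but route it through the updated vp9_hex_search / find_fractional_mv_step signatures; the NULL cost tables mean pure prediction error is minimized, since these vectors are never coded. The shape of the pattern, with stub helpers that are hypothetical stand-ins rather than libvpx calls:

    #include <stddef.h>

    /* Stubs for illustration only. */
    static int hex_search_fullpel(int *row, int *col) {
      (void)row; (void)col; return 0;
    }
    static int subpel_refine(int *row, int *col,
                             const int *joint_cost, const int *comp_cost) {
      (void)row; (void)col; (void)joint_cost; (void)comp_cost; return 0;
    }

    static int tf_motion_search_sketch(int *row, int *col, int do_subpel) {
      int err = hex_search_fullpel(row, col);       /* stage 1: integer pel */
      if (do_subpel)                                /* stage 2, if enabled  */
        err = subpel_refine(row, col, NULL, NULL);  /* NULL: ignore MV rate */
      return err;
    }
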
@@ -170,6 +170,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], + 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion, &sse); } @@ -244,8 +245,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (cpi->frames[frame] == NULL) continue; - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col = 0; + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row = 0; + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col = 0; if (frame == alt_ref_index) { filter_weight = 2; @@ -278,8 +279,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->u_buffer + mb_uv_offset, cpi->frames[frame]->v_buffer + mb_uv_offset, cpi->frames[frame]->y_stride, - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row, - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col, + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row, + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col, predictor); // Apply the filter (YUV) diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index caa89b2..0c9bf9d 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -97,103 +97,51 @@ struct tokenize_b_args { TX_SIZE tx_size; }; -static void set_entropy_context_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; - TX_SIZE tx_size = ss_txfrm_size >> 1; - MACROBLOCKD *xd = args->xd; - const int bwl = b_width_log2(bsize); - const int off = block >> (2 * tx_size); - const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; - const int aoff = (off & ((1 << mod) - 1)) << tx_size; - const int loff = (off >> mod) << tx_size; - ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; - ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - const int eob = xd->plane[plane].eobs[block]; - const int tx_size_in_blocks = 1 << tx_size; - - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff, - A, L); - } else { - vpx_memset(A, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - vpx_memset(L, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *pd = &xd->plane[plane]; + int aoff, loff; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); + set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff); } -static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; - const TX_SIZE tx_size = ss_txfrm_size >> 1; - const int tx_size_in_blocks = 1 << tx_size; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + struct macroblockd_plane *pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; int pt; /* near block/prev token context index */ int c = 0, rc = 0; TOKENEXTRA *t = *tp; /* store tokens starting here */ - const int eob = xd->plane[plane].eobs[block]; - const PLANE_TYPE type = xd->plane[plane].plane_type; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); - const int bwl = 
b_width_log2(bsize); - const int off = block >> (2 * tx_size); - const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; - const int aoff = (off & ((1 << mod) - 1)) << tx_size; - const int loff = (off >> mod) << tx_size; - ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; - ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - int seg_eob; + const int eob = pd->eobs[block]; + const PLANE_TYPE type = pd->plane_type; + const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); + const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; - vp9_coeff_count *counts; - vp9_coeff_probs_model *coef_probs; + vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; + vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; const int ref = is_inter_block(mbmi); - ENTROPY_CONTEXT above_ec, left_ec; uint8_t token_cache[1024]; const uint8_t *band_translate; - assert((!type && !plane) || (type && plane)); + ENTROPY_CONTEXT *A, *L; + const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); + int aoff, loff; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); - counts = cpi->coef_counts[tx_size]; - coef_probs = cpi->common.fc.coef_probs[tx_size]; - switch (tx_size) { - default: - case TX_4X4: - above_ec = A[0] != 0; - left_ec = L[0] != 0; - seg_eob = 16; - scan = get_scan_4x4(get_tx_type_4x4(type, xd, block)); - band_translate = vp9_coefband_trans_4x4; - break; - case TX_8X8: - above_ec = !!*(uint16_t *)A; - left_ec = !!*(uint16_t *)L; - seg_eob = 64; - scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - band_translate = vp9_coefband_trans_8x8plus; - break; - case TX_16X16: - above_ec = !!*(uint32_t *)A; - left_ec = !!*(uint32_t *)L; - seg_eob = 256; - scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - band_translate = vp9_coefband_trans_8x8plus; - break; - case TX_32X32: - above_ec = !!*(uint64_t *)A; - left_ec = !!*(uint64_t *)L; - seg_eob = 1024; - scan = vp9_default_scan_32x32; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - - pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan); + A = pd->above_context + aoff; + L = pd->left_context + loff; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) - seg_eob = 0; + assert((!type && !plane) || (type && plane)); + pt = get_entropy_context(xd, tx_size, type, block, A, L, + &scan, &band_translate); + nb = vp9_get_coef_neighbors_handle(scan); c = 0; do { const int band = get_coef_band(band_translate, c); @@ -227,62 +175,53 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, } while (c < eob && ++c < seg_eob); *tp = t; - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, c, aoff, loff, - A, L); - } else { - vpx_memset(A, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - vpx_memset(L, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } + + set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff); } struct is_skippable_args { MACROBLOCKD *xd; int *skippable; }; + static void is_skippable(int plane, int block, - BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *argv) { + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *argv) { struct is_skippable_args *args = argv; args->skippable[0] &= (!args->xd->plane[plane].eobs[block]); } -int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { +int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) { int result = 1; struct is_skippable_args args = {xd, 
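Aside: the long switch deleted from tokenize_b() above encoded two regularities that get_entropy_context() now hides: a transform of size tx covers 1 << tx 4x4 context positions, so the above/left flags were read as 1-, 2-, 4- or 8-byte loads, and seg_eob grows as 16 << (2 * tx). The same tests without the type punning; combine_ctx reflects my reading of combine_entropy_contexts, not a quoted definition:

    #include <stdint.h>

    typedef int8_t ENTROPY_CONTEXT;

    /* Any nonzero context byte under this transform? Replaces the old
     * !!*(uint16_t/uint32_t/uint64_t *) loads, one byte per 4x4 column. */
    static int ctx_nonzero(const ENTROPY_CONTEXT *ctx, int tx_size) {
      const int n = 1 << tx_size;  /* 1, 2, 4, 8 for TX_4X4..TX_32X32 */
      for (int i = 0; i < n; ++i)
        if (ctx[i]) return 1;
      return 0;
    }

    static int seg_eob_for_tx(int tx_size) {
      return 16 << (2 * tx_size);  /* 16, 64, 256, 1024 */
    }

    static int combine_ctx(int above_ec, int left_ec) {
      return (above_ec != 0) + (left_ec != 0);  /* assumed definition */
    }
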
&result}; foreach_transformed_block(xd, bsize, is_skippable, &args); return result; } -int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { - int result = 1; - struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args); - return result; -} - -int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { +int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { int result = 1; struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_uv(xd, bsize, is_skippable, &args); + foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &args); return result; } void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; TOKENEXTRA *t_backup = *t; const int mb_skip_context = vp9_get_pred_context_mbskip(xd); - const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id, + const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = {cpi, xd, t, mbmi->txfm_size}; + struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size}; - mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize); - if (mbmi->mb_skip_coeff) { + mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize); + if (mbmi->skip_coeff) { if (!dry_run) cm->counts.mbskip[mb_skip_context][1] += skip_inc; - vp9_reset_sb_tokens_context(xd, bsize); + reset_skip_context(xd, bsize); if (dry_run) *t = t_backup; return; diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index 968bec7..b78e100 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -31,13 +31,13 @@ typedef struct { typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS + 1]; -int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); -int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); -int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); +int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize); +int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane); struct VP9_COMP; void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, - BLOCK_SIZE_TYPE bsize); + BLOCK_SIZE bsize); #ifdef ENTROPY_STATS void init_context_counters(); diff --git a/libvpx/vp9/encoder/vp9_variance_c.c b/libvpx/vp9/encoder/vp9_variance_c.c index 23e7767..155ba8a 100644 --- a/libvpx/vp9/encoder/vp9_variance_c.c +++ b/libvpx/vp9/encoder/vp9_variance_c.c @@ -46,12 +46,12 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 64, hfilter); @@ -68,13 +68,13 @@ unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - 
uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 64, hfilter); @@ -103,12 +103,12 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 32, hfilter); @@ -125,13 +125,13 @@ unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 32, hfilter); @@ -160,12 +160,12 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 32, hfilter); @@ -182,13 +182,13 @@ unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 32, hfilter); @@ -217,12 +217,12 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; const int16_t 
*hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 16, hfilter); @@ -239,13 +239,13 @@ unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 16, hfilter); @@ -440,10 +440,10 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, unsigned int *sse) { uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - uint16_t fdata3[5 * 4]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, @@ -466,10 +466,10 @@ unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer - uint16_t fdata3[5 * 4]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, @@ -488,12 +488,12 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[9 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 8, hfilter); @@ -510,13 +510,13 @@ unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[9 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 8, hfilter); @@ 
-532,12 +532,12 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[17 * 16]; // Temp data bufffer used in filtering + uint16_t fdata3[17 * 16]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 16, hfilter); @@ -559,8 +559,8 @@ unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 16, hfilter); @@ -577,12 +577,12 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 64, hfilter); @@ -599,13 +599,13 @@ unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 64, hfilter); @@ -621,12 +621,12 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 32, hfilter); @@ -643,13 +643,13 @@ unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - 
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 32, hfilter); @@ -785,12 +785,12 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering + uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 16, hfilter); @@ -807,13 +807,13 @@ unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering + uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 16, hfilter); @@ -829,12 +829,12 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 8, hfilter); @@ -851,13 +851,13 @@ unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 8, hfilter); @@ -873,12 +873,12 @@ unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[8 * 5]; // Temp data bufffer used in filtering + uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 5, 8, hfilter); @@ -895,13 +895,13 @@ unsigned int 
vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[8 * 5]; // Temp data bufffer used in filtering + uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 5, 8, hfilter); @@ -917,14 +917,14 @@ unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[5 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be // of this big? same issue appears in all other block size settings. uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 4, hfilter); @@ -941,13 +941,13 @@ unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[5 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 4, hfilter); diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c new file mode 100644 index 0000000..95ae266 --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c @@ -0,0 +1,2650 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> // SSE2 +#include "vp9/common/vp9_idct.h" // for cospi constants +#include "vpx_ports/mem.h" + +#if FDCT32x32_HIGH_PRECISION +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) { + // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers + __m128i sign_bit = _mm_and_si128(a, mask16); + __m128i b = _mm_unpacklo_epi16(a, kZero); + sign_bit = _mm_cmplt_epi16(sign_bit, kZero); + sign_bit = _mm_unpacklo_epi16(kZero, sign_bit); + return _mm_or_si128(sign_bit, b); +} + +static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) { + // convert the upper 4 signed 16-bit integers into 4 signed 32-bit integers + __m128i sign_bit = _mm_and_si128(a, mask16); + __m128i b = _mm_unpackhi_epi16(a, kZero); + sign_bit = _mm_cmplt_epi16(sign_bit, kZero); + sign_bit = _mm_unpackhi_epi16(kZero, sign_bit); + return _mm_or_si128(sign_bit, b); +} +#endif + +void FDCT32x32_2D(int16_t *input, + int16_t *output_org, int pitch) { + // Calculate pre-multiplied strides + const int str1 = pitch >> 1; + const int str2 = pitch; + const int str3 = pitch + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32-bit constant corresponding to that pair.
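Two idioms recur throughout this kernel. In the default 16-bit path, every butterfly rotation interleaves two vectors with _mm_unpacklo_epi16/_mm_unpackhi_epi16, multiplies against a repeated (c1, c2) cosine pair built by pair_set_epi16 (which repeats the pair across all eight 16-bit lanes), then rounds, shifts, and saturates back to 16 bits via _mm_add_epi32, _mm_srai_epi32 and _mm_packs_epi32. In the FDCT32x32_HIGH_PRECISION path, k_madd_epi32 above stands in for the packed signed 32x32 multiply-add that SSE2 lacks. The scalar sketch below models what a single lane computes, assuming DCT_CONST_BITS is 14 as in vp9_idct.h; the helper names are hypothetical, and the plain casts do not model the saturation that _mm_packs_epi32 performs.

    #include <stdint.h>

    /* Assumed to match vp9_idct.h, where DCT_CONST_BITS is 14. */
    #define SKETCH_DCT_CONST_BITS 14
    #define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

    /* One lane of the 16-bit rotation: _mm_madd_epi16 on interleaved lanes
     * yields a*c1 + b*c2 in 32 bits; adding k__DCT_CONST_ROUNDING and
     * shifting by DCT_CONST_BITS is dct_const_round_shift(); the final
     * cast stands in for the saturating _mm_packs_epi32. */
    static int16_t sketch_butterfly_lane(int16_t a, int16_t b,
                                         int16_t c1, int16_t c2) {
      int32_t sum = (int32_t)a * c1 + (int32_t)b * c2;
      return (int16_t)((sum + SKETCH_DCT_CONST_ROUNDING) >>
                       SKETCH_DCT_CONST_BITS);
    }

    /* One 64-bit lane of k_madd_epi32() (lane is 0 or 1): _mm_mul_epu32
     * multiplies the even dwords, _mm_srli_epi64(x, 32) drops the odd
     * dwords into even position for the second multiply, and
     * _mm_add_epi64 sums the two products. The intrinsic multiply is
     * unsigned, which is why this model takes uint32_t lanes. */
    static uint64_t sketch_k_madd_epi32(const uint32_t a[4],
                                        const uint32_t b[4], int lane) {
      uint64_t even = (uint64_t)a[2 * lane] * b[2 * lane];
      uint64_t odd = (uint64_t)a[2 * lane + 1] * b[2 * lane + 1];
      return even + odd;
    }

k_packs_epi64 then keeps only the low dword of each 64-bit sum, and the k_cvtlo/k_cvthi helpers widen 16-bit lanes to 32 bits by pairing _mm_unpacklo_epi16/_mm_unpackhi_epi16 against zero with a sign mask derived from _mm_cmplt_epi16.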
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in the second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic makes the code slightly slower. + if (0 == pass) { + int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop.
That would help the + // instruction cache but is actually slower. + { + int16_t *ina = in + 0 * str1; + int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[ 0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 4 * str1; + int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[ 4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 8 * str1; + int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[ 8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 
0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 12 * str1; + int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
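Both stage-1 branches above compute the same mirrored butterfly: element i is paired with element 31 - i, with the sums filling step1[0..15] and the differences filling step1[31..16]. Pass 0 additionally pre-scales the input by << 2 (the _mm_slli_epi16 calls), while pass 1 re-reads the transposed intermediate buffer at a fixed stride of 32 with no scaling. A minimal scalar sketch of one column, with hypothetical names; `stride` is in int16_t units (str1 in pass 0, 32 in pass 1):

    #include <stdint.h>

    /* Scalar model of Stage 1 for one column of the 32x32 forward DCT. */
    static void sketch_stage1_column(const int16_t *in, int stride,
                                     int16_t step1[32], int pass) {
      int i;
      for (i = 0; i < 16; ++i) {
        int sum = in[i * stride] + in[(31 - i) * stride];
        int diff = in[i * stride] - in[(31 - i) * stride];
        if (pass == 0) {  /* mirrors the _mm_slli_epi16(x, 2) pre-scaling */
          sum <<= 2;
          diff <<= 2;
        }
        step1[i] = (int16_t)sum;       /* sums feed the even half */
        step1[31 - i] = (int16_t)diff; /* differences feed the odd half */
      }
    }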
+ { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[ 0] = _mm_add_epi16(in00, in31); + step1[ 1] = _mm_add_epi16(in01, in30); + step1[ 2] = _mm_add_epi16(in02, in29); + step1[ 3] = _mm_add_epi16(in03, in28); + step1[28] = _mm_sub_epi16(in03, in28); + step1[29] = _mm_sub_epi16(in02, in29); + step1[30] = _mm_sub_epi16(in01, in30); + step1[31] = _mm_sub_epi16(in00, in31); + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[ 4] = _mm_add_epi16(in04, in27); + step1[ 5] = _mm_add_epi16(in05, in26); + step1[ 6] = _mm_add_epi16(in06, in25); + step1[ 7] = _mm_add_epi16(in07, in24); + step1[24] = _mm_sub_epi16(in07, in24); + step1[25] = _mm_sub_epi16(in06, in25); + step1[26] = _mm_sub_epi16(in05, in26); + step1[27] = _mm_sub_epi16(in04, in27); + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[ 8] = _mm_add_epi16(in08, in23); + step1[ 9] = _mm_add_epi16(in09, in22); + step1[10] = _mm_add_epi16(in10, in21); + step1[11] = _mm_add_epi16(in11, in20); + step1[20] = _mm_sub_epi16(in11, in20); + step1[21] = _mm_sub_epi16(in10, in21); + step1[22] = _mm_sub_epi16(in09, in22); + step1[23] = _mm_sub_epi16(in08, in23); + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = _mm_add_epi16(in12, in19); + step1[13] = _mm_add_epi16(in13, in18); + step1[14] = _mm_add_epi16(in14, in17); + step1[15] = _mm_add_epi16(in15, in16); + step1[16] = _mm_sub_epi16(in15, in16); + step1[17] = _mm_sub_epi16(in14, in17); + step1[18] = _mm_sub_epi16(in13, in18); + step1[19] = _mm_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[ 0] = _mm_add_epi16(step1[0], step1[15]); + 
step2[ 1] = _mm_add_epi16(step1[1], step1[14]); + step2[ 2] = _mm_add_epi16(step1[2], step1[13]); + step2[ 3] = _mm_add_epi16(step1[3], step1[12]); + step2[ 4] = _mm_add_epi16(step1[4], step1[11]); + step2[ 5] = _mm_add_epi16(step1[5], step1[10]); + step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); + step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); + step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); + step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); + step2[10] = _mm_sub_epi16(step1[5], step1[10]); + step2[11] = _mm_sub_epi16(step1[4], step1[11]); + step2[12] = _mm_sub_epi16(step1[3], step1[12]); + step2[13] = _mm_sub_epi16(step1[2], step1[13]); + step2[14] = _mm_sub_epi16(step1[1], step1[14]); + step2[15] = _mm_sub_epi16(step1[0], step1[15]); + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = 
_mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); + } + +#if !FDCT32x32_HIGH_PRECISION + // scale the magnitude back down so the intermediate values stay within + // the range of 16 bits: (x + 1) >> 2 for x >= 0 and (x + 2) >> 2 for x < 0. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); + + step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0); + step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0); + step2[ 2] =
_mm_sub_epi16(step2[ 2], s3_02_0); + step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0); + step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0); + step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0); + step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0); + step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0); + step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); + step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); + step2[10] = _mm_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm_sub_epi16(step1[31], s3_31_0); + + step2[ 0] = _mm_add_epi16(step2[ 0], kOne); + step2[ 1] = _mm_add_epi16(step2[ 1], kOne); + step2[ 2] = _mm_add_epi16(step2[ 2], kOne); + step2[ 3] = _mm_add_epi16(step2[ 3], kOne); + step2[ 4] = _mm_add_epi16(step2[ 4], kOne); + step2[ 5] = _mm_add_epi16(step2[ 5], kOne); + step2[ 6] = _mm_add_epi16(step2[ 6], kOne); + step2[ 7] = _mm_add_epi16(step2[ 7], kOne); + step2[ 8] = _mm_add_epi16(step2[ 8], kOne); + step2[ 9] = _mm_add_epi16(step2[ 9], kOne); + step2[10] = _mm_add_epi16(step2[10], kOne); + step2[11] = _mm_add_epi16(step2[11], kOne); + step2[12] = _mm_add_epi16(step2[12], kOne); + step2[13] = _mm_add_epi16(step2[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step1[16] = _mm_add_epi16(step1[16], kOne); + step1[17] = _mm_add_epi16(step1[17], kOne); + step1[18] = _mm_add_epi16(step1[18], kOne); + step1[19] = _mm_add_epi16(step1[19], kOne); + step2[20] = _mm_add_epi16(step2[20], kOne); + step2[21] = _mm_add_epi16(step2[21], kOne); + step2[22] = _mm_add_epi16(step2[22], kOne); + step2[23] = _mm_add_epi16(step2[23], kOne); + step2[24] = _mm_add_epi16(step2[24], kOne); + step2[25] = _mm_add_epi16(step2[25], kOne); + step2[26] = _mm_add_epi16(step2[26], kOne); + step2[27] = _mm_add_epi16(step2[27], kOne); + step1[28] = _mm_add_epi16(step1[28], kOne); + step1[29] = _mm_add_epi16(step1[29], kOne); + step1[30] = _mm_add_epi16(step1[30], kOne); + step1[31] = _mm_add_epi16(step1[31], kOne); + + step2[ 0] = _mm_srai_epi16(step2[ 0], 2); + step2[ 1] = _mm_srai_epi16(step2[ 1], 2); + step2[ 2] = _mm_srai_epi16(step2[ 2], 2); + step2[ 3] = _mm_srai_epi16(step2[ 3], 2); + step2[ 4] = _mm_srai_epi16(step2[ 4], 2); + step2[ 5] = _mm_srai_epi16(step2[ 5], 2); + step2[ 6] = _mm_srai_epi16(step2[ 6], 2); + step2[ 7] = _mm_srai_epi16(step2[ 7], 2); + step2[ 8] = _mm_srai_epi16(step2[ 8], 2); + step2[ 9] = _mm_srai_epi16(step2[ 9], 2); + step2[10] = _mm_srai_epi16(step2[10], 2); + step2[11] = _mm_srai_epi16(step2[11], 2); + step2[12] = _mm_srai_epi16(step2[12], 2); + step2[13] = _mm_srai_epi16(step2[13], 2); + 
step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step1[16] = _mm_srai_epi16(step1[16], 2); + step1[17] = _mm_srai_epi16(step1[17], 2); + step1[18] = _mm_srai_epi16(step1[18], 2); + step1[19] = _mm_srai_epi16(step1[19], 2); + step2[20] = _mm_srai_epi16(step2[20], 2); + step2[21] = _mm_srai_epi16(step2[21], 2); + step2[22] = _mm_srai_epi16(step2[22], 2); + step2[23] = _mm_srai_epi16(step2[23], 2); + step2[24] = _mm_srai_epi16(step2[24], 2); + step2[25] = _mm_srai_epi16(step2[25], 2); + step2[26] = _mm_srai_epi16(step2[26], 2); + step2[27] = _mm_srai_epi16(step2[27], 2); + step1[28] = _mm_srai_epi16(step1[28], 2); + step1[29] = _mm_srai_epi16(step1[29], 2); + step1[30] = _mm_srai_epi16(step1[30], 2); + step1[31] = _mm_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm_add_epi16(step2[23], 
step1[16]); + step3[17] = _mm_add_epi16(step2[22], step1[17]); + step3[18] = _mm_add_epi16(step2[21], step1[18]); + step3[19] = _mm_add_epi16(step2[20], step1[19]); + step3[20] = _mm_sub_epi16(step1[19], step2[20]); + step3[21] = _mm_sub_epi16(step1[18], step2[21]); + step3[22] = _mm_sub_epi16(step1[17], step2[22]); + step3[23] = _mm_sub_epi16(step1[16], step2[23]); + step3[24] = _mm_sub_epi16(step1[31], step2[24]); + step3[25] = _mm_sub_epi16(step1[30], step2[25]); + step3[26] = _mm_sub_epi16(step1[29], step2[26]); + step3[27] = _mm_sub_epi16(step1[28], step2[27]); + step3[28] = _mm_add_epi16(step2[27], step1[28]); + step3[29] = _mm_add_epi16(step2[26], step1[29]); + step3[30] = _mm_add_epi16(step2[25], step1[30]); + step3[31] = _mm_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); + step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); + step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); + step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); + step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); + step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); + step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); + step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); + step1[12] = _mm_sub_epi16(step2[15], step3[12]); + step1[13] = _mm_sub_epi16(step2[14], step3[13]); + step1[14] = _mm_add_epi16(step3[13], step2[14]); + step1[15] = _mm_add_epi16(step3[12], step2[15]); + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, 
k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm_add_epi16(step1[5], step3[4]); + step2[5] = _mm_sub_epi16(step3[4], step1[5]); + step2[6] = _mm_sub_epi16(step3[7], step1[6]); + step2[7] = _mm_add_epi16(step1[6], step3[7]); + } + { + const __m128i 
out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, 
k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm_add_epi16(step1[19], step3[16]); + step2[17] = _mm_add_epi16(step1[18], step3[17]); + step2[18] = _mm_sub_epi16(step3[17], step1[18]); + step2[19] = _mm_sub_epi16(step3[16], step1[19]); + step2[20] = _mm_sub_epi16(step3[23], step1[20]); + step2[21] = _mm_sub_epi16(step3[22], step1[21]); + step2[22] = _mm_add_epi16(step1[21], step3[22]); + step2[23] = _mm_add_epi16(step1[20], step3[23]); + step2[24] = _mm_add_epi16(step1[27], step3[24]); + step2[25] = _mm_add_epi16(step1[26], step3[25]); + step2[26] = _mm_sub_epi16(step3[25], step1[26]); + step2[27] = _mm_sub_epi16(step3[24], step1[27]); + step2[28] = _mm_sub_epi16(step3[31], step1[28]); + step2[29] = _mm_sub_epi16(step3[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step3[30]); + step2[31] = _mm_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, 
DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); + } + { + step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); + step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); + step3[10] = _mm_sub_epi16(step1[11], step2[10]); + step3[11] = _mm_add_epi16(step2[10], step1[11]); + step3[12] = _mm_add_epi16(step2[13], step1[12]); + step3[13] = _mm_sub_epi16(step1[12], step2[13]); + step3[14] = _mm_sub_epi16(step1[15], step2[14]); + step3[15] = _mm_add_epi16(step2[14], step1[15]); + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = 
_mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, 
k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm_add_epi16(step3[17], step2[16]); + step1[17] = _mm_sub_epi16(step2[16], step3[17]); + step1[18] = _mm_sub_epi16(step2[19], step3[18]); + step1[19] = _mm_add_epi16(step3[18], step2[19]); + step1[20] = _mm_add_epi16(step3[21], step2[20]); + step1[21] = _mm_sub_epi16(step2[20], step3[21]); + step1[22] = _mm_sub_epi16(step2[23], step3[22]); + step1[23] = _mm_add_epi16(step3[22], step2[23]); + step1[24] = _mm_add_epi16(step3[25], step2[24]); + step1[25] = _mm_sub_epi16(step2[24], step3[25]); + step1[26] = _mm_sub_epi16(step2[27], step3[26]); + step1[27] = _mm_add_epi16(step3[26], step2[27]); + step1[28] = _mm_add_epi16(step3[29], step2[28]); + step1[29] = _mm_sub_epi16(step2[28], step3[29]); + step1[30] = _mm_sub_epi16(step2[31], step3[30]); + step1[31] = 
_mm_add_epi16(step3[30], step2[31]); + } + // Final stage --- output indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, 
DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = 
_mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m128i lstep1[64], lstep2[64], lstep3[64]; + __m128i u[32], v[32], sign[16]; + const __m128i mask16 = _mm_set1_epi32(0x80008000); + const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length prior to addition operations + lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero); + lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero); + lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero); + lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero); + lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero); + lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero); + lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero); + lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero); + lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero); + lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero); + lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero); + lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero); + lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero); + lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero); + lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero); + lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero); + + lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); + lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); + lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); + lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); + lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); + lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); + lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); + lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); + lstep3[ 8] = 
_mm_sub_epi32(lstep2[ 6], lstep2[ 8]); + lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); + lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero); + lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero); + lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero); + lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero); + lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero); + lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero); + lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero); + lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero); + lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero); + lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero); + lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero); + lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero); + lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero); + lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero); + lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero); + lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero); + + lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero); + lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero); + lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero); + lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero); + lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero); + lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero); + 
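The k_cvtlo_epi16()/k_cvthi_epi16() calls in this high-precision path widen the low and high four signed 16-bit lanes of a register to 32 bits before the stage adds; plain SSE2 has no sign-extending move (_mm_cvtepi16_epi32 is SSE4.1), which is why the helpers are handed the 0x80008000 sign mask and kZero. A minimal sketch of an equivalent SSE2 sign extension -- an assumption about what the helpers must compute, not their actual definition in this file:

    #include <emmintrin.h>

    /* widen the low four int16 lanes of a to int32 (SSE2 only) */
    static inline __m128i cvtlo_epi16_sketch(__m128i a) {
      /* 0xFFFF in each half-word lane where a is negative, 0x0000 elsewhere */
      const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), a);
      /* interleave: value in the low half-word, its sign bits in the high */
      return _mm_unpacklo_epi16(a, sign);  /* _mm_unpackhi_epi16 for the high lanes */
    }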
lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero); + lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero); + lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero); + lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero); + lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero); + lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero); + lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero); + lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero); + lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero); + lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero); + + lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length prior to addition operations + lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero); + lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero); + lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero); + lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero); + lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero); + lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero); + lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero); + lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero); + + lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); + lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); + lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); + lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); + lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); + lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); + lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); + lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); + lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = 
_mm_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[ 0] = k_madd_epi32(u[0], k32_p16_m16); + v[ 1] = k_madd_epi32(u[1], k32_p16_m16); + v[ 2] = k_madd_epi32(u[2], k32_p16_m16); + v[ 3] = k_madd_epi32(u[3], k32_p16_m16); + v[ 4] = k_madd_epi32(u[0], k32_p16_p16); + v[ 5] = k_madd_epi32(u[1], k32_p16_p16); + v[ 6] = k_madd_epi32(u[2], k32_p16_p16); + v[ 7] = k_madd_epi32(u[3], k32_p16_p16); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); + v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); + v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); + v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); + v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); + v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); + v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); + v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); + v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); + v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); + v[10] = k_madd_epi32(u[10], 
k32_m24_m08); + v[11] = k_madd_epi32(u[11], k32_m24_m08); + v[12] = k_madd_epi32(u[12], k32_m24_m08); + v[13] = k_madd_epi32(u[13], k32_m24_m08); + v[14] = k_madd_epi32(u[14], k32_m24_m08); + v[15] = k_madd_epi32(u[15], k32_m24_m08); + v[16] = k_madd_epi32(u[12], k32_m08_p24); + v[17] = k_madd_epi32(u[13], k32_m08_p24); + v[18] = k_madd_epi32(u[14], k32_m08_p24); + v[19] = k_madd_epi32(u[15], k32_m08_p24); + v[20] = k_madd_epi32(u[ 8], k32_m08_p24); + v[21] = k_madd_epi32(u[ 9], k32_m08_p24); + v[22] = k_madd_epi32(u[10], k32_m08_p24); + v[23] = k_madd_epi32(u[11], k32_m08_p24); + v[24] = k_madd_epi32(u[ 4], k32_p24_p08); + v[25] = k_madd_epi32(u[ 5], k32_p24_p08); + v[26] = k_madd_epi32(u[ 6], k32_p24_p08); + v[27] = k_madd_epi32(u[ 7], k32_p24_p08); + v[28] = k_madd_epi32(u[ 0], k32_p24_p08); + v[29] = k_madd_epi32(u[ 1], k32_p24_p08); + v[30] = k_madd_epi32(u[ 2], k32_p24_p08); + v[31] = k_madd_epi32(u[ 3], k32_p24_p08); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); + lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); + lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]); + 
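Each unpacklo/unpackhi + k_madd_epi32 + k_packs_epi64 group in this 32-bit path is one vectorized butterfly rotation: the two inputs are interleaved, multiplied against a pair_set_epi32() cosine pair at full 32x32-bit precision, and the 64-bit products are repacked for the usual dct_const_round_shift. A scalar sketch of what one such rotation (the k32_m08_p24/k32_p24_p08 pair from stage 4 above) computes per lane, assuming libvpx's DCT_CONST_BITS == 14 and cospi_N_64 == round(16384 * cos(N * pi / 64)):

    #include <stdint.h>

    #define DCT_CONST_BITS     14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* hypothetical scalar reference: rotate (x, y) by the angle encoded in
       (c0, c1) = (cospi_8_64, cospi_24_64), with dct_const_round_shift */
    static void butterfly_sketch(int32_t x, int32_t y, int32_t c0, int32_t c1,
                                 int32_t *out0, int32_t *out1) {
      /* add half of the scale factor, then shift the scale back out */
      *out0 = (int32_t)(((int64_t)x * -c0 + (int64_t)y * c1 +
                         DCT_CONST_ROUNDING) >> DCT_CONST_BITS);  /* k32_m08_p24 */
      *out1 = (int32_t)(((int64_t)x * c1 + (int64_t)y * c0 +
                         DCT_CONST_ROUNDING) >> DCT_CONST_BITS);  /* k32_p24_p08 */
    }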
lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[ 0] = k_madd_epi32(u[0], k32_p16_p16); + v[ 1] = k_madd_epi32(u[1], k32_p16_p16); + v[ 2] = k_madd_epi32(u[2], k32_p16_p16); + v[ 3] = k_madd_epi32(u[3], k32_p16_p16); + v[ 4] = k_madd_epi32(u[0], k32_p16_m16); + v[ 5] = k_madd_epi32(u[1], k32_p16_m16); + v[ 6] = k_madd_epi32(u[2], k32_p16_m16); + v[ 7] = k_madd_epi32(u[3], k32_p16_m16); + v[ 8] = k_madd_epi32(u[4], k32_p24_p08); + v[ 9] = k_madd_epi32(u[5], k32_p24_p08); + v[10] = k_madd_epi32(u[6], k32_p24_p08); + v[11] = k_madd_epi32(u[7], k32_p24_p08); + v[12] = k_madd_epi32(u[4], k32_m08_p24); + v[13] = k_madd_epi32(u[5], k32_m08_p24); + v[14] = k_madd_epi32(u[6], k32_m08_p24); + v[15] = k_madd_epi32(u[7], k32_m08_p24); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = 
_mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + // Combine + out[ 0] = _mm_packs_epi32(u[0], u[1]); + out[16] = _mm_packs_epi32(u[2], u[3]); + out[ 8] = _mm_packs_epi32(u[4], u[5]); + out[24] = _mm_packs_epi32(u[6], u[7]); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m24_m08); + v[5] = k_madd_epi32(u[5], k32_m24_m08); + v[6] = k_madd_epi32(u[6], k32_m24_m08); + v[7] = k_madd_epi32(u[7], k32_m24_m08); + v[ 8] = k_madd_epi32(u[4], k32_m08_p24); + v[ 9] = k_madd_epi32(u[5], k32_m08_p24); + v[10] = k_madd_epi32(u[6], k32_m08_p24); + v[11] = k_madd_epi32(u[7], k32_m08_p24); + v[12] = k_madd_epi32(u[0], k32_p24_p08); + v[13] = k_madd_epi32(u[1], k32_p24_p08); + v[14] = k_madd_epi32(u[2], k32_p24_p08); + v[15] = k_madd_epi32(u[3], k32_p24_p08); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = 
_mm_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); + u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + + v[0] = k_madd_epi32(u[0], k32_p28_p04); + v[1] = k_madd_epi32(u[1], k32_p28_p04); + v[2] = k_madd_epi32(u[2], k32_p28_p04); + v[3] = k_madd_epi32(u[3], k32_p28_p04); + v[4] = k_madd_epi32(u[4], k32_p12_p20); + v[5] = k_madd_epi32(u[5], k32_p12_p20); + v[6] = k_madd_epi32(u[6], k32_p12_p20); + v[7] = k_madd_epi32(u[7], k32_p12_p20); + v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); + v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m04_p28); + v[13] = k_madd_epi32(u[13], k32_m04_p28); + v[14] = k_madd_epi32(u[14], k32_m04_p28); + v[15] = k_madd_epi32(u[15], k32_m04_p28); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = 
_mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + out[ 4] = _mm_packs_epi32(u[0], u[1]); + out[20] = _mm_packs_epi32(u[2], u[3]); + out[12] = _mm_packs_epi32(u[4], u[5]); + out[28] = _mm_packs_epi32(u[6], u[7]); + } + { + lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64, + -cospi_20_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + + u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[ 1] = _mm_unpackhi_epi32(lstep2[34], 
lstep2[60]); + u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28); + v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28); + v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28); + v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28); + v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04); + v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04); + v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04); + v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04); + v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); + v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m12_m20); + v[13] = k_madd_epi32(u[13], k32_m12_m20); + v[14] = k_madd_epi32(u[14], k32_m12_m20); + v[15] = k_madd_epi32(u[15], k32_m12_m20); + v[16] = k_madd_epi32(u[12], k32_m20_p12); + v[17] = k_madd_epi32(u[13], k32_m20_p12); + v[18] = k_madd_epi32(u[14], k32_m20_p12); + v[19] = k_madd_epi32(u[15], k32_m20_p12); + v[20] = k_madd_epi32(u[ 8], k32_p12_p20); + v[21] = k_madd_epi32(u[ 9], k32_p12_p20); + v[22] = k_madd_epi32(u[10], k32_p12_p20); + v[23] = k_madd_epi32(u[11], k32_p12_p20); + v[24] = k_madd_epi32(u[ 4], k32_m04_p28); + v[25] = k_madd_epi32(u[ 5], k32_m04_p28); + v[26] = k_madd_epi32(u[ 6], k32_m04_p28); + v[27] = k_madd_epi32(u[ 7], k32_m04_p28); + v[28] = k_madd_epi32(u[ 0], k32_p28_p04); + v[29] = k_madd_epi32(u[ 1], k32_p28_p04); + v[30] = k_madd_epi32(u[ 2], k32_p28_p04); + v[31] = k_madd_epi32(u[ 3], k32_p28_p04); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], 
k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); + const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); + const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); + const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); + const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); + const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); + + u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02); + v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02); + v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02); + v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02); + v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18); + v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18); + v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18); + v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18); + v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10); + v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10); + v[10] = k_madd_epi32(u[10], k32_p22_p10); + v[11] = k_madd_epi32(u[11], k32_p22_p10); + v[12] = k_madd_epi32(u[12], k32_p06_p26); + v[13] = k_madd_epi32(u[13], k32_p06_p26); + v[14] = k_madd_epi32(u[14], k32_p06_p26); + v[15] = k_madd_epi32(u[15], k32_p06_p26); + v[16] = k_madd_epi32(u[12], k32_m26_p06); + v[17] = k_madd_epi32(u[13], k32_m26_p06); + v[18] = k_madd_epi32(u[14], k32_m26_p06); + v[19] = k_madd_epi32(u[15], k32_m26_p06); + v[20] = k_madd_epi32(u[ 8], k32_m10_p22); + v[21] = k_madd_epi32(u[ 9], k32_m10_p22); + v[22] = k_madd_epi32(u[10], k32_m10_p22); + v[23] = k_madd_epi32(u[11], k32_m10_p22); + v[24] = k_madd_epi32(u[ 4], 
k32_m18_p14); + v[25] = k_madd_epi32(u[ 5], k32_m18_p14); + v[26] = k_madd_epi32(u[ 6], k32_m18_p14); + v[27] = k_madd_epi32(u[ 7], k32_m18_p14); + v[28] = k_madd_epi32(u[ 0], k32_m02_p30); + v[29] = k_madd_epi32(u[ 1], k32_m02_p30); + v[30] = k_madd_epi32(u[ 2], k32_m02_p30); + v[31] = k_madd_epi32(u[ 3], k32_m02_p30); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); + u[ 
7] = _mm_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[ 0] = _mm_add_epi32(u[ 0], K32One); + v[ 1] = _mm_add_epi32(u[ 1], K32One); + v[ 2] = _mm_add_epi32(u[ 2], K32One); + v[ 3] = _mm_add_epi32(u[ 3], K32One); + v[ 4] = _mm_add_epi32(u[ 4], K32One); + v[ 5] = _mm_add_epi32(u[ 5], K32One); + v[ 6] = _mm_add_epi32(u[ 6], K32One); + v[ 7] = _mm_add_epi32(u[ 7], K32One); + v[ 8] = _mm_add_epi32(u[ 8], K32One); + v[ 9] = _mm_add_epi32(u[ 9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[ 0] = _mm_srai_epi32(v[ 0], 2); + u[ 1] = _mm_srai_epi32(v[ 1], 2); + u[ 2] = _mm_srai_epi32(v[ 2], 2); + u[ 3] = _mm_srai_epi32(v[ 3], 2); + u[ 4] = _mm_srai_epi32(v[ 4], 2); + u[ 5] = _mm_srai_epi32(v[ 5], 2); + u[ 6] = _mm_srai_epi32(v[ 6], 2); + u[ 7] = _mm_srai_epi32(v[ 7], 2); + u[ 8] = _mm_srai_epi32(v[ 8], 2); + u[ 9] = _mm_srai_epi32(v[ 9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[ 2] = _mm_packs_epi32(u[0], u[1]); + out[18] = _mm_packs_epi32(u[2], u[3]); + out[10] = _mm_packs_epi32(u[4], u[5]); + out[26] = _mm_packs_epi32(u[6], u[7]); + out[ 6] = _mm_packs_epi32(u[8], u[9]); + out[22] = _mm_packs_epi32(u[10], u[11]); + out[14] = _mm_packs_epi32(u[12], u[13]); + out[30] = _mm_packs_epi32(u[14], u[15]); + } + { + lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = 
_mm_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); + const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); + const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); + const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); + const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); + const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); + const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); + const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); + + u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01); + v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01); + v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01); + v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01); + v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17); + v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17); + v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17); + v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17); + v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09); + v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09); + v[10] = k_madd_epi32(u[10], k32_p23_p09); + v[11] = k_madd_epi32(u[11], k32_p23_p09); + v[12] = k_madd_epi32(u[12], k32_p07_p25); + v[13] = k_madd_epi32(u[13], k32_p07_p25); + v[14] = k_madd_epi32(u[14], k32_p07_p25); + v[15] = k_madd_epi32(u[15], k32_p07_p25); + v[16] = k_madd_epi32(u[12], k32_m25_p07); + v[17] = k_madd_epi32(u[13], k32_m25_p07); + v[18] = k_madd_epi32(u[14], k32_m25_p07); + v[19] = k_madd_epi32(u[15], k32_m25_p07); + v[20] = k_madd_epi32(u[ 8], k32_m09_p23); + v[21] = k_madd_epi32(u[ 9], k32_m09_p23); + v[22] = k_madd_epi32(u[10], k32_m09_p23); + v[23] = k_madd_epi32(u[11], k32_m09_p23); + v[24] = k_madd_epi32(u[ 4], k32_m17_p15); + v[25] = k_madd_epi32(u[ 5], k32_m17_p15); + v[26] = k_madd_epi32(u[ 6], k32_m17_p15); + v[27] = k_madd_epi32(u[ 7], k32_m17_p15); + v[28] = k_madd_epi32(u[ 0], k32_m01_p31); + v[29] = k_madd_epi32(u[ 1], k32_m01_p31); + v[30] = k_madd_epi32(u[ 2], k32_m01_p31); + v[31] = k_madd_epi32(u[ 3], k32_m01_p31); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + 
u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = 
_mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[ 1] = _mm_packs_epi32(u[0], u[1]); + out[17] = _mm_packs_epi32(u[2], u[3]); + out[ 9] = _mm_packs_epi32(u[4], u[5]); + out[25] = _mm_packs_epi32(u[6], u[7]); + out[ 7] = _mm_packs_epi32(u[8], u[9]); + out[23] = _mm_packs_epi32(u[10], u[11]); + out[15] = _mm_packs_epi32(u[12], u[13]); + out[31] = _mm_packs_epi32(u[14], u[15]); + } + { + const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); + const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); + const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); + const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); + const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); + const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); + const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); + const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); + + u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05); + v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05); + v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05); + v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05); + v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21); + v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21); + v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21); + v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21); + v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13); + v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13); + v[10] = k_madd_epi32(u[10], k32_p19_p13); + v[11] = k_madd_epi32(u[11], k32_p19_p13); + v[12] = k_madd_epi32(u[12], k32_p03_p29); + v[13] = k_madd_epi32(u[13], k32_p03_p29); + v[14] = k_madd_epi32(u[14], k32_p03_p29); + v[15] = k_madd_epi32(u[15], k32_p03_p29); + v[16] = k_madd_epi32(u[12], k32_m29_p03); + v[17] = k_madd_epi32(u[13], k32_m29_p03); + v[18] = k_madd_epi32(u[14], k32_m29_p03); + v[19] = k_madd_epi32(u[15], k32_m29_p03); + v[20] = k_madd_epi32(u[ 8], k32_m13_p19); + v[21] = k_madd_epi32(u[ 9], k32_m13_p19); + v[22] = k_madd_epi32(u[10], k32_m13_p19); + v[23] = k_madd_epi32(u[11], k32_m13_p19); + v[24] = k_madd_epi32(u[ 4], k32_m21_p11); + v[25] = k_madd_epi32(u[ 5], 
k32_m21_p11); + v[26] = k_madd_epi32(u[ 6], k32_m21_p11); + v[27] = k_madd_epi32(u[ 7], k32_m21_p11); + v[28] = k_madd_epi32(u[ 0], k32_m05_p27); + v[29] = k_madd_epi32(u[ 1], k32_m05_p27); + v[30] = k_madd_epi32(u[ 2], k32_m05_p27); + v[31] = k_madd_epi32(u[ 3], k32_m05_p27); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); + u[ 8] = 
_mm_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[ 5] = _mm_packs_epi32(u[0], u[1]); + out[21] = _mm_packs_epi32(u[2], u[3]); + out[13] = _mm_packs_epi32(u[4], u[5]); + out[29] = _mm_packs_epi32(u[6], u[7]); + out[ 3] = _mm_packs_epi32(u[8], u[9]); + out[19] = _mm_packs_epi32(u[10], u[11]); + out[11] = _mm_packs_epi32(u[12], u[13]); + out[27] = _mm_packs_epi32(u[14], u[15]); + } + } +#endif + // Transpose the results, do it as four 8x8 transposes. 
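The four 8x8 transposes below use the standard three-level unpack ladder: interleave 16-bit lanes of adjacent rows, then 32-bit pairs, then 64-bit halves. As a self-contained reference (illustrative only, not part of the patch), the same ladder applied to one 8x8 tile of int16 coefficients:

  #include <emmintrin.h>

  // Transpose an 8x8 tile of int16 values held in eight SSE2 registers.
  // On return, lane j of rows[i] holds what was lane i of rows[j].
  static void transpose_8x8_epi16(__m128i rows[8]) {
    // Level 1: interleave 16-bit lanes of adjacent rows.
    const __m128i a0 = _mm_unpacklo_epi16(rows[0], rows[1]);
    const __m128i a1 = _mm_unpacklo_epi16(rows[2], rows[3]);
    const __m128i a2 = _mm_unpackhi_epi16(rows[0], rows[1]);
    const __m128i a3 = _mm_unpackhi_epi16(rows[2], rows[3]);
    const __m128i a4 = _mm_unpacklo_epi16(rows[4], rows[5]);
    const __m128i a5 = _mm_unpacklo_epi16(rows[6], rows[7]);
    const __m128i a6 = _mm_unpackhi_epi16(rows[4], rows[5]);
    const __m128i a7 = _mm_unpackhi_epi16(rows[6], rows[7]);
    // Level 2: interleave 32-bit pairs.
    const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
    const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
    const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
    const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
    const __m128i b5 = _mm_unpacklo_epi32(a6, a7);
    const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
    const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
    // Level 3: interleave 64-bit halves to finish the transpose.
    rows[0] = _mm_unpacklo_epi64(b0, b4);
    rows[1] = _mm_unpackhi_epi64(b0, b4);
    rows[2] = _mm_unpacklo_epi64(b2, b6);
    rows[3] = _mm_unpackhi_epi64(b2, b6);
    rows[4] = _mm_unpacklo_epi64(b1, b5);
    rows[5] = _mm_unpackhi_epi64(b1, b5);
    rows[6] = _mm_unpacklo_epi64(b3, b7);
    rows[7] = _mm_unpackhi_epi64(b3, b7);
  }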
+ { + int transpose_block; + int16_t *output; + if (0 == pass) { + output = &intermediate[column_start * 32]; + } else { + output = &output_org[column_start * 32]; + } + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... 
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in vp9/encoder/vp9_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic makes the code slightly slower. + _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); + // Process next 8x8 + output += 8; + } + } + } +} diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index bf09c7a..eb271fe 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -2572,1224 +2572,14 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, write_buffer_16x16(output, in0, in1, 16); } -void vp9_short_fdct32x32_rd_sse2(int16_t *input, - int16_t *output_org, int pitch) { - // Calculate pre-multiplied strides - const int str1 = pitch >> 1; - const int str2 = pitch; - const int str3 = pitch + str1; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
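The pair-constant trick works because _mm_madd_epi16 multiplies adjacent 16-bit lanes and sums each product pair into one 32-bit lane: with the data interleaved as (x0, y0, x1, y1, ...) and the constant replicated as (c0, c1, c0, c1, ...), every 32-bit lane yields x*c0 + y*c1 in a single instruction, which is then rounded and shifted back to 16 bits. A rough sketch of that recurring unpack/madd/round/shift/pack cluster; the my_-prefixed names are illustrative stand-ins for helpers and constants defined elsewhere in libvpx and may differ in detail:

  #include <emmintrin.h>
  #include <stdint.h>

  #define MY_DCT_CONST_BITS 14  // assumed fixed-point precision
  #define MY_DCT_CONST_ROUNDING (1 << (MY_DCT_CONST_BITS - 1))

  // Replicate the 16-bit pair (a, b) across the register: a b a b a b a b.
  static __m128i my_pair_set_epi16(int16_t a, int16_t b) {
    return _mm_set_epi16(b, a, b, a, b, a, b, a);
  }

  // One butterfly: round_shift(x*a + y*b) for eight lanes, packed to int16.
  static __m128i my_butterfly(__m128i x, __m128i y, int16_t a, int16_t b) {
    const __m128i k = my_pair_set_epi16(a, b);
    const __m128i rounding = _mm_set1_epi32(MY_DCT_CONST_ROUNDING);
    const __m128i lo = _mm_unpacklo_epi16(x, y);   // x0 y0 x1 y1 ...
    const __m128i hi = _mm_unpackhi_epi16(x, y);   // x4 y4 x5 y5 ...
    const __m128i plo = _mm_madd_epi16(lo, k);     // x*a + y*b, 32-bit lanes
    const __m128i phi = _mm_madd_epi16(hi, k);
    const __m128i rlo =
        _mm_srai_epi32(_mm_add_epi32(plo, rounding), MY_DCT_CONST_BITS);
    const __m128i rhi =
        _mm_srai_epi32(_mm_add_epi32(phi, rounding), MY_DCT_CONST_BITS);
    return _mm_packs_epi32(rlo, rhi);              // saturating pack to int16
  }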
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - int pass; - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 32; column_start += 8) { - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - // Note: even though all the loads below are aligned, using the aligned - // intrinsic makes the code slightly slower. - if (0 == pass) { - int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. 
That would help the - // instruction cache but is actually slower. - { - int16_t *ina = in + 0 * str1; - int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[ 0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - int16_t *ina = in + 4 * str1; - int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[ 4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - int16_t *ina = in + 8 * str1; - int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[ 8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 
0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - int16_t *ina = in + 12 * str1; - int16_t *inb = in + 19 * str1; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. 
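As the notes say, the four mirrored load blocks are an unrolled loop. Rolled up, the second pass would look roughly like the sketch below (the first pass would additionally shift each sum and difference left by 2). A sketch under the assumption that in points at the current eight columns of the 32x32 intermediate buffer; illustrative only, not part of the patch:

  #include <emmintrin.h>
  #include <stdint.h>

  // Stage 1, second pass: step1[i] = in[i*32] + in[(31-i)*32] and
  // step1[31-i] = in[i*32] - in[(31-i)*32], mirrored around the middle.
  static void stage1_pass1_rolled(const int16_t *in, __m128i step1[32]) {
    int i;
    for (i = 0; i < 16; ++i) {
      const __m128i top =
          _mm_loadu_si128((const __m128i *)(in + i * 32));
      const __m128i bot =
          _mm_loadu_si128((const __m128i *)(in + (31 - i) * 32));
      step1[i]      = _mm_add_epi16(top, bot);  // mirrored sum
      step1[31 - i] = _mm_sub_epi16(top, bot);  // mirrored difference
    }
  }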
- { - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); - step1[ 0] = _mm_add_epi16(in00, in31); - step1[ 1] = _mm_add_epi16(in01, in30); - step1[ 2] = _mm_add_epi16(in02, in29); - step1[ 3] = _mm_add_epi16(in03, in28); - step1[28] = _mm_sub_epi16(in03, in28); - step1[29] = _mm_sub_epi16(in02, in29); - step1[30] = _mm_sub_epi16(in01, in30); - step1[31] = _mm_sub_epi16(in00, in31); - } - { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); - step1[ 4] = _mm_add_epi16(in04, in27); - step1[ 5] = _mm_add_epi16(in05, in26); - step1[ 6] = _mm_add_epi16(in06, in25); - step1[ 7] = _mm_add_epi16(in07, in24); - step1[24] = _mm_sub_epi16(in07, in24); - step1[25] = _mm_sub_epi16(in06, in25); - step1[26] = _mm_sub_epi16(in05, in26); - step1[27] = _mm_sub_epi16(in04, in27); - } - { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); - step1[ 8] = _mm_add_epi16(in08, in23); - step1[ 9] = _mm_add_epi16(in09, in22); - step1[10] = _mm_add_epi16(in10, in21); - step1[11] = _mm_add_epi16(in11, in20); - step1[20] = _mm_sub_epi16(in11, in20); - step1[21] = _mm_sub_epi16(in10, in21); - step1[22] = _mm_sub_epi16(in09, in22); - step1[23] = _mm_sub_epi16(in08, in23); - } - { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); - step1[12] = _mm_add_epi16(in12, in19); - step1[13] = _mm_add_epi16(in13, in18); - step1[14] = _mm_add_epi16(in14, in17); - step1[15] = _mm_add_epi16(in15, in16); - step1[16] = _mm_sub_epi16(in15, in16); - step1[17] = _mm_sub_epi16(in14, in17); - step1[18] = _mm_sub_epi16(in13, in18); - step1[19] = _mm_sub_epi16(in12, in19); - } - } - // Stage 2 - { - step2[ 0] = _mm_add_epi16(step1[0], step1[15]); - 
step2[ 1] = _mm_add_epi16(step1[1], step1[14]); - step2[ 2] = _mm_add_epi16(step1[2], step1[13]); - step2[ 3] = _mm_add_epi16(step1[3], step1[12]); - step2[ 4] = _mm_add_epi16(step1[4], step1[11]); - step2[ 5] = _mm_add_epi16(step1[5], step1[10]); - step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); - step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); - step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); - step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); - step2[10] = _mm_sub_epi16(step1[5], step1[10]); - step2[11] = _mm_sub_epi16(step1[4], step1[11]); - step2[12] = _mm_sub_epi16(step1[3], step1[12]); - step2[13] = _mm_sub_epi16(step1[2], step1[13]); - step2[14] = _mm_sub_epi16(step1[1], step1[14]); - step2[15] = _mm_sub_epi16(step1[0], step1[15]); - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = 
_mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); - } - // Stage 3 - { - step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm_add_epi16(step2[23], step1[16]); - step3[17] = _mm_add_epi16(step2[22], step1[17]); - step3[18] = _mm_add_epi16(step2[21], step1[18]); - step3[19] = _mm_add_epi16(step2[20], step1[19]); - step3[20] = _mm_sub_epi16(step1[19], step2[20]); - step3[21] = _mm_sub_epi16(step1[18], step2[21]); - step3[22] = _mm_sub_epi16(step1[17], step2[22]); - step3[23] = _mm_sub_epi16(step1[16], step2[23]); - step3[24] = _mm_sub_epi16(step1[31], step2[24]); - step3[25] = _mm_sub_epi16(step1[30], step2[25]); - step3[26] = _mm_sub_epi16(step1[29], step2[26]); - step3[27] = _mm_sub_epi16(step1[28], step2[27]); - step3[28] = _mm_add_epi16(step2[27], step1[28]); - step3[29] = _mm_add_epi16(step2[26], step1[29]); - step3[30] = _mm_add_epi16(step2[25], step1[30]); - step3[31] = _mm_add_epi16(step2[24], step1[31]); - } - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); - __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero); - __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero); - __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero); - __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero); - __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); - __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); - __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero); - __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero); - __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero); - __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero); - __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero); - __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero); - __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero); - __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero); - __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero); - __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero); - __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero); - __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero); - __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero); - __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero); - __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero); - __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero); - step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0); - step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0); - step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0); - step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0); - step3[ 4] = _mm_sub_epi16(step3[ 4], 
s3_04_0); - step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0); - step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0); - step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0); - step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); - step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); - step3[10] = _mm_sub_epi16(step3[10], s3_10_0); - step3[11] = _mm_sub_epi16(step3[11], s3_11_0); - step3[12] = _mm_sub_epi16(step3[12], s3_12_0); - step3[13] = _mm_sub_epi16(step3[13], s3_13_0); - step2[14] = _mm_sub_epi16(step2[14], s2_14_0); - step2[15] = _mm_sub_epi16(step2[15], s2_15_0); - step3[16] = _mm_sub_epi16(step3[16], s3_16_0); - step3[17] = _mm_sub_epi16(step3[17], s3_17_0); - step3[18] = _mm_sub_epi16(step3[18], s3_18_0); - step3[19] = _mm_sub_epi16(step3[19], s3_19_0); - step3[20] = _mm_sub_epi16(step3[20], s3_20_0); - step3[21] = _mm_sub_epi16(step3[21], s3_21_0); - step3[22] = _mm_sub_epi16(step3[22], s3_22_0); - step3[23] = _mm_sub_epi16(step3[23], s3_23_0); - step3[24] = _mm_sub_epi16(step3[24], s3_24_0); - step3[25] = _mm_sub_epi16(step3[25], s3_25_0); - step3[26] = _mm_sub_epi16(step3[26], s3_26_0); - step3[27] = _mm_sub_epi16(step3[27], s3_27_0); - step3[28] = _mm_sub_epi16(step3[28], s3_28_0); - step3[29] = _mm_sub_epi16(step3[29], s3_29_0); - step3[30] = _mm_sub_epi16(step3[30], s3_30_0); - step3[31] = _mm_sub_epi16(step3[31], s3_31_0); - step3[ 0] = _mm_add_epi16(step3[ 0], kOne); - step3[ 1] = _mm_add_epi16(step3[ 1], kOne); - step3[ 2] = _mm_add_epi16(step3[ 2], kOne); - step3[ 3] = _mm_add_epi16(step3[ 3], kOne); - step3[ 4] = _mm_add_epi16(step3[ 4], kOne); - step3[ 5] = _mm_add_epi16(step3[ 5], kOne); - step3[ 6] = _mm_add_epi16(step3[ 6], kOne); - step3[ 7] = _mm_add_epi16(step3[ 7], kOne); - step2[ 8] = _mm_add_epi16(step2[ 8], kOne); - step2[ 9] = _mm_add_epi16(step2[ 9], kOne); - step3[10] = _mm_add_epi16(step3[10], kOne); - step3[11] = _mm_add_epi16(step3[11], kOne); - step3[12] = _mm_add_epi16(step3[12], kOne); - step3[13] = _mm_add_epi16(step3[13], kOne); - step2[14] = _mm_add_epi16(step2[14], kOne); - step2[15] = _mm_add_epi16(step2[15], kOne); - step3[16] = _mm_add_epi16(step3[16], kOne); - step3[17] = _mm_add_epi16(step3[17], kOne); - step3[18] = _mm_add_epi16(step3[18], kOne); - step3[19] = _mm_add_epi16(step3[19], kOne); - step3[20] = _mm_add_epi16(step3[20], kOne); - step3[21] = _mm_add_epi16(step3[21], kOne); - step3[22] = _mm_add_epi16(step3[22], kOne); - step3[23] = _mm_add_epi16(step3[23], kOne); - step3[24] = _mm_add_epi16(step3[24], kOne); - step3[25] = _mm_add_epi16(step3[25], kOne); - step3[26] = _mm_add_epi16(step3[26], kOne); - step3[27] = _mm_add_epi16(step3[27], kOne); - step3[28] = _mm_add_epi16(step3[28], kOne); - step3[29] = _mm_add_epi16(step3[29], kOne); - step3[30] = _mm_add_epi16(step3[30], kOne); - step3[31] = _mm_add_epi16(step3[31], kOne); - step3[ 0] = _mm_srai_epi16(step3[ 0], 2); - step3[ 1] = _mm_srai_epi16(step3[ 1], 2); - step3[ 2] = _mm_srai_epi16(step3[ 2], 2); - step3[ 3] = _mm_srai_epi16(step3[ 3], 2); - step3[ 4] = _mm_srai_epi16(step3[ 4], 2); - step3[ 5] = _mm_srai_epi16(step3[ 5], 2); - step3[ 6] = _mm_srai_epi16(step3[ 6], 2); - step3[ 7] = _mm_srai_epi16(step3[ 7], 2); - step2[ 8] = _mm_srai_epi16(step2[ 8], 2); - step2[ 9] = _mm_srai_epi16(step2[ 9], 2); - step3[10] = _mm_srai_epi16(step3[10], 2); - step3[11] = _mm_srai_epi16(step3[11], 2); - step3[12] = _mm_srai_epi16(step3[12], 2); - step3[13] = _mm_srai_epi16(step3[13], 2); - step2[14] = _mm_srai_epi16(step2[14], 2); - step2[15] = _mm_srai_epi16(step2[15], 2); - step3[16] = _mm_srai_epi16(step3[16], 2); - 
step3[17] = _mm_srai_epi16(step3[17], 2); - step3[18] = _mm_srai_epi16(step3[18], 2); - step3[19] = _mm_srai_epi16(step3[19], 2); - step3[20] = _mm_srai_epi16(step3[20], 2); - step3[21] = _mm_srai_epi16(step3[21], 2); - step3[22] = _mm_srai_epi16(step3[22], 2); - step3[23] = _mm_srai_epi16(step3[23], 2); - step3[24] = _mm_srai_epi16(step3[24], 2); - step3[25] = _mm_srai_epi16(step3[25], 2); - step3[26] = _mm_srai_epi16(step3[26], 2); - step3[27] = _mm_srai_epi16(step3[27], 2); - step3[28] = _mm_srai_epi16(step3[28], 2); - step3[29] = _mm_srai_epi16(step3[29], 2); - step3[30] = _mm_srai_epi16(step3[30], 2); - step3[31] = _mm_srai_epi16(step3[31], 2); - } - // Stage 4 - { - step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); - step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); - step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); - step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); - step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); - step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); - step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); - step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); - step1[12] = _mm_sub_epi16(step2[15], step3[12]); - step1[13] = _mm_sub_epi16(step2[14], step3[13]); - step1[14] = _mm_add_epi16(step3[13], step2[14]); - step1[15] = _mm_add_epi16(step3[12], step2[15]); - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = 
_mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm_add_epi16(step1[5], step3[4]); - step2[5] = _mm_sub_epi16(step3[4], step1[5]); - step2[6] = _mm_sub_epi16(step3[7], step1[6]); - step2[7] = _mm_add_epi16(step1[6], step3[7]); - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - 
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 
= _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm_add_epi16(step1[19], step3[16]); - step2[17] = _mm_add_epi16(step1[18], step3[17]); - step2[18] = _mm_sub_epi16(step3[17], step1[18]); - step2[19] = _mm_sub_epi16(step3[16], step1[19]); - step2[20] = _mm_sub_epi16(step3[23], step1[20]); - step2[21] = _mm_sub_epi16(step3[22], step1[21]); - step2[22] = _mm_add_epi16(step1[21], step3[22]); - step2[23] = _mm_add_epi16(step1[20], step3[23]); - step2[24] = _mm_add_epi16(step1[27], step3[24]); - step2[25] = _mm_add_epi16(step1[26], step3[25]); - step2[26] = _mm_sub_epi16(step3[25], step1[26]); - step2[27] = _mm_sub_epi16(step3[24], step1[27]); - step2[28] = _mm_sub_epi16(step3[31], step1[28]); - step2[29] = _mm_sub_epi16(step3[30], step1[29]); - step2[30] = _mm_add_epi16(step1[29], step3[30]); - step2[31] = _mm_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = 
_mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); - } - { - step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); - step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); - step3[10] = _mm_sub_epi16(step1[11], step2[10]); - step3[11] = _mm_add_epi16(step2[10], step1[11]); - step3[12] = _mm_add_epi16(step2[13], step1[12]); - step3[13] = _mm_sub_epi16(step1[12], step2[13]); - step3[14] = _mm_sub_epi16(step1[15], step2[14]); - step3[15] = _mm_add_epi16(step2[14], step1[15]); - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const 
__m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const 
__m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm_add_epi16(step3[17], step2[16]); - step1[17] = _mm_sub_epi16(step2[16], step3[17]); - step1[18] = _mm_sub_epi16(step2[19], step3[18]); - step1[19] = _mm_add_epi16(step3[18], step2[19]); - step1[20] = _mm_add_epi16(step3[21], step2[20]); - step1[21] = _mm_sub_epi16(step2[20], step3[21]); - step1[22] = _mm_sub_epi16(step2[23], step3[22]); - step1[23] = _mm_add_epi16(step3[22], step2[23]); - step1[24] = _mm_add_epi16(step3[25], step2[24]); - step1[25] = _mm_sub_epi16(step2[24], step3[25]); - step1[26] = _mm_sub_epi16(step2[27], step3[26]); - step1[27] = _mm_add_epi16(step3[26], step2[27]); - step1[28] = _mm_add_epi16(step3[29], step2[28]); - step1[29] = _mm_sub_epi16(step2[28], step3[29]); - step1[30] = _mm_sub_epi16(step2[31], step3[30]); - step1[31] = _mm_add_epi16(step3[30], step2[31]); - } - // Final stage --- output indices are bit-reversed.
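Every butterfly in these stages follows the same fixed-point idiom: unpack two 16-bit vectors into interleaved pairs, multiply-accumulate against packed Q14 cosine constants with _mm_madd_epi16, add k__DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS, and pack back to 16 bits. A scalar sketch of that dct_const_round_shift step, assuming the DCT_CONST_BITS value of 14 from vp9/common/vp9_idct.h (the helper name here is illustrative, not libvpx's):

#include <stdint.h>

#define DCT_CONST_BITS 14  /* Q14 cosine constants, per vp9/common/vp9_idct.h */

/* Per 32-bit lane, the k__DCT_CONST_ROUNDING add plus _mm_srai_epi32 compute: */
static int16_t dct_const_round_shift_sketch(int32_t input) {
  return (int16_t)((input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

So, for example, the out[8]/out[24] pair earlier in this function is, per coefficient, out8 = dct_const_round_shift(step1_2 * cospi_24_64 + step1_3 * cospi_8_64) and out24 = dct_const_round_shift(step1_3 * cospi_24_64 - step1_2 * cospi_8_64).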
- { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const 
__m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = _mm_add_epi32(out_11_2, 
k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); - } - // Transpose the results, do it as four 8x8 transposes. 
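Each of the four 8x8 transposes below is three rounds of interleaves, widening from 16-bit to 32-bit to 64-bit granularity, with no other shuffles. A self-contained sketch of the same idiom (the function name is illustrative, not from libvpx):

#include <emmintrin.h>

/* Transpose eight rows of eight int16_t values held in r[0..7], in place. */
static void transpose_8x8_epi16_sketch(__m128i r[8]) {
  /* Pass 1: 16-bit interleave, e.g. a0 = 00 10 01 11 02 12 03 13. */
  const __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);
  const __m128i a1 = _mm_unpacklo_epi16(r[2], r[3]);
  const __m128i a2 = _mm_unpackhi_epi16(r[0], r[1]);
  const __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
  const __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
  const __m128i a5 = _mm_unpacklo_epi16(r[6], r[7]);
  const __m128i a6 = _mm_unpackhi_epi16(r[4], r[5]);
  const __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);
  /* Pass 2: 32-bit interleave, e.g. b0 = 00 10 20 30 01 11 21 31. */
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b5 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b6 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  /* Pass 3: 64-bit interleave, e.g. r[0] = 00 10 20 30 40 50 60 70 (column 0). */
  r[0] = _mm_unpacklo_epi64(b0, b4);
  r[1] = _mm_unpackhi_epi64(b0, b4);
  r[2] = _mm_unpacklo_epi64(b1, b5);
  r[3] = _mm_unpackhi_epi64(b1, b5);
  r[4] = _mm_unpacklo_epi64(b2, b6);
  r[5] = _mm_unpackhi_epi64(b2, b6);
  r[6] = _mm_unpacklo_epi64(b3, b7);
  r[7] = _mm_unpackhi_epi64(b3, b7);
}

The removed code runs this once per 8-register quadrant of the 32-wide row block, which is why the loop below iterates transpose_block from 0 to 3.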
- { - int transpose_block; - int16_t *output; - if (0 == pass) { - output = &intermediate[column_start * 32]; - } else { - output = &output_org[column_start * 32]; - } - for (transpose_block = 0; transpose_block < 4; ++transpose_block) { - __m128i *this_out = &out[8 * transpose_block]; - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (0 == pass) { - // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; - // TODO(cd): see quality impact of only doing - // output[j] = (output[j] + 1) >> 2; - // which would remove the code between here ...
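The code between those two markers implements the rounding named in the comment: _mm_cmpgt_epi16 returns 0xFFFF, arithmetically -1, in each positive lane, so subtracting that mask adds one there before the +1 bias and the shift by 2. A scalar equivalent of the pass-0 output scaling (the name is illustrative):

/* Divide by 4, rounding halves away from zero: (x + 1 + (x > 0)) >> 2. */
static int16_t fdct32_pass0_round_sketch(int16_t x) {
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}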
- __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); - __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); - __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); - __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); - __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); - __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); - __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); - __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); - tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); - tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); - tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); - tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); - tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); - tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); - tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); - tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); - // ... and here. - // PS: also change code in vp9/encoder/vp9_dct.c - tr2_0 = _mm_add_epi16(tr2_0, kOne); - tr2_1 = _mm_add_epi16(tr2_1, kOne); - tr2_2 = _mm_add_epi16(tr2_2, kOne); - tr2_3 = _mm_add_epi16(tr2_3, kOne); - tr2_4 = _mm_add_epi16(tr2_4, kOne); - tr2_5 = _mm_add_epi16(tr2_5, kOne); - tr2_6 = _mm_add_epi16(tr2_6, kOne); - tr2_7 = _mm_add_epi16(tr2_7, kOne); - tr2_0 = _mm_srai_epi16(tr2_0, 2); - tr2_1 = _mm_srai_epi16(tr2_1, 2); - tr2_2 = _mm_srai_epi16(tr2_2, 2); - tr2_3 = _mm_srai_epi16(tr2_3, 2); - tr2_4 = _mm_srai_epi16(tr2_4, 2); - tr2_5 = _mm_srai_epi16(tr2_5, 2); - tr2_6 = _mm_srai_epi16(tr2_6, 2); - tr2_7 = _mm_srai_epi16(tr2_7, 2); - } - // Note: even though all these stores are aligned, using the aligned - // intrinsic makes the code slightly slower. - _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); - _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); - _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); - _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); - _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); - _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); - _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); - _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); - // Process next 8x8 - output += 8; - } - } - } - } -} +#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D vp9_short_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm index 60f7991..db30660 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -36,6 +36,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pshufd m4, m4, 0 mova m2, [quantq] ; m2 = quant paddw m0, m4 ; m0 = zbin + zbin_oq +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif mova m3, [r2q] ; m3 = dequant psubw m0, [pw_1] mov r2, shiftmp @@ -43,6 +51,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob lea coeffq, [ coeffq+ncoeffq*2] @@ -56,16 +67,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6
= abs(m9) pabsw m11, m10 ; m11 = abs(m10) -%ifidn %1, b_32x32 - paddw m6, m6 - paddw m11, m11 -%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin punpckhqdq m0, m0 pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddw m6, m1 ; m6 += round + paddsw m6, m1 ; m6 += round punpckhqdq m1, m1 - paddw m11, m1 ; m11 += round + paddsw m11, m1 ; m11 += round pmulhw m8, m6, m2 ; m8 = m6*q>>16 punpckhqdq m2, m2 pmulhw m13, m11, m2 ; m13 = m11*q>>16 @@ -112,10 +119,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) -%ifidn %1, b_32x32 - paddw m6, m6 - paddw m11, m11 -%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin %ifidn %1, b_32x32 @@ -124,8 +127,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ or r6, r2 jz .skip_iter %endif - paddw m6, m1 ; m6 += round - paddw m11, m1 ; m11 += round + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 @@ -164,6 +167,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop + %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm index 19e2feb..533456b 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -270,8 +270,13 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %if mmsize == 16 movhps m2, [srcq+src_strideq*2] %else ; mmsize == 8 +%if %1 == 4 + movh m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%else punpckldq m2, [srcq+src_strideq*2] %endif +%endif movh m1, [dstq] %if mmsize == 16 movlhps m0, m2 @@ -542,9 +547,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ movhps m2, [srcq+src_strideq] movhps m3, [srcq+src_strideq+1] %else +%if %1 == 4 + movh m1, [srcq+src_strideq] + punpckldq m2, m1 + movh m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%else punpckldq m2, [srcq+src_strideq] punpckldq m3, [srcq+src_strideq+1] %endif +%endif pavgb m2, m3 %if mmsize == 16 movlhps m0, m2 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm index d3dbefe..3501cf1 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm @@ -342,8 +342,8 @@ sym(vp9_get4x4var_mmx): movsxd rdx, dword ptr arg(3) ;[recon_stride] ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm0, [rax] ; Copy 4 bytes to mm0 + movd mm1, [rbx] ; Copy 4 bytes to mm1 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -351,12 +351,12 @@ sym(vp9_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy 4 bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy 4 bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -365,11 +365,11 @@ sym(vp9_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add
rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy 4 bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy 4 bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -378,11 +378,11 @@ sym(vp9_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy 4 bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy 4 bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index b4ff850..cea934d 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -244,7 +244,7 @@ unsigned int vp9_variance16x16_sse2 return (var - (((unsigned int)avg * avg) >> 8)); } -unsigned int vp9_mse16x16_wmt( +unsigned int vp9_mse16x16_sse2( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, @@ -500,7 +500,7 @@ FNS(ssse3, ssse3); #undef FNS #undef FN -unsigned int vp9_variance_halfpixvar16x16_h_wmt( +unsigned int vp9_variance_halfpixvar16x16_h_sse2( const unsigned char *src_ptr, int src_pixels_per_line, const unsigned char *dst_ptr, @@ -519,7 +519,7 @@ unsigned int vp9_variance_halfpixvar16x16_h_wmt( } -unsigned int vp9_variance_halfpixvar16x16_v_wmt( +unsigned int vp9_variance_halfpixvar16x16_v_sse2( const unsigned char *src_ptr, int src_pixels_per_line, const unsigned char *dst_ptr, @@ -537,7 +537,7 @@ unsigned int vp9_variance_halfpixvar16x16_v_wmt( } -unsigned int vp9_variance_halfpixvar16x16_hv_wmt( +unsigned int vp9_variance_halfpixvar16x16_hv_sse2( const unsigned char *src_ptr, int src_pixels_per_line, const unsigned char *dst_ptr, diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index b2b2a80..687fb48 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -49,6 +49,8 @@ VP9_COMMON_SRCS-yes += common/vp9_rtcd.c VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h +VP9_COMMON_SRCS-yes += common/vp9_scale.h +VP9_COMMON_SRCS-yes += common/vp9_scale.c VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h @@ -71,29 +73,41 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c -VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h -VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -ifeq ($(CONFIG_POSTPROC),yes) +ifeq
($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif ifeq ($(USE_X86INC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm endif VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index be7828f..48866d2 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -51,14 +51,14 @@ static const struct extraconfig_map extracfg_map[] = { { NULL, 0, /* cpu_used */ - 0, /* enable_auto_alt_ref */ + 1, /* enable_auto_alt_ref */ 0, /* noise_sensitivity */ 0, /* Sharpness */ 0, /* static_thresh */ 0, /* tile_columns */ 0, /* tile_rows */ - 0, /* arnr_max_frames */ - 3, /* arnr_strength */ + 7, /* arnr_max_frames */ + 5, /* arnr_strength */ 3, /* arnr_type*/ 0, /* experimental mode */ 0, /* tuning*/ @@ -89,6 +89,18 @@ struct vpx_codec_alg_priv { unsigned int fixed_kf_cntr; }; +static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { + switch (frame) { + case VP8_LAST_FRAME: + return VP9_LAST_FLAG; + case VP8_GOLD_FRAME: + return VP9_GOLD_FLAG; + case VP8_ALTR_FRAME: + return VP9_ALT_FLAG; + } + assert(!"Invalid Reference Frame"); + return VP9_LAST_FLAG; +} static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx, @@ -148,7 +160,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_threads, 64); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); - RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ); + RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); @@ -160,6 +172,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, 
RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); + RANGE_CHECK(cfg, ss_number_layers, 1, + VPX_SS_MAX_LAYERS); /*Spatial layers max */ /* VP8 does not support a lower bound on the keyframe interval in * automatic keyframe placement mode. */ @@ -262,13 +276,15 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, // VBR only supported for now. // CBR code has been deprecated for experimental phase. // CQ mode not yet tested - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; - /*if (cfg.rc_end_usage == VPX_CQ) oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; else oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;*/ + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + /* + if (cfg.rc_end_usage == VPX_CQ) + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + */ + if (cfg.rc_end_usage == VPX_Q) + oxcf->end_usage = USAGE_CONSTANT_QUALITY; - oxcf->target_bandwidth = cfg.rc_target_bitrate; + oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; @@ -317,6 +333,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->error_resilient_mode = cfg.g_error_resilient; oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; + + oxcf->ss_number_layers = cfg.ss_number_layers; /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -411,21 +429,22 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, #define MAP(id, var) case id: var = CAST(id, args); break; switch (ctrl_id) { - MAP(VP8E_SET_CPUUSED, xcfg.cpu_used); - MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref); - MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); - MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); - MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); - MAP(VP9E_SET_TILE_COLUMNS, xcfg.tile_columns); - MAP(VP9E_SET_TILE_ROWS, xcfg.tile_rows); - - MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); - MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength); - MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type); - MAP(VP8E_SET_TUNING, xcfg.tuning); - MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); - MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); - MAP(VP9E_SET_LOSSLESS, xcfg.lossless); + MAP(VP8E_SET_CPUUSED, xcfg.cpu_used); + MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref); + MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); + MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); + MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); + MAP(VP9E_SET_TILE_COLUMNS, xcfg.tile_columns); + MAP(VP9E_SET_TILE_ROWS, xcfg.tile_rows); + MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); + MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength); + MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type); + MAP(VP8E_SET_TUNING, xcfg.tuning); + MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); + MAP(VP9E_SET_MAX_Q, ctx->cfg.rc_max_quantizer); + MAP(VP9E_SET_MIN_Q, ctx->cfg.rc_min_quantizer); + MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); + MAP(VP9E_SET_LOSSLESS, xcfg.lossless); MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode); } @@ -846,7 +865,8 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); - vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd); + vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), + &sd); return VPX_CODEC_OK; } else return VPX_CODEC_INVALID_PARAM; @@ -864,7 +884,8 @@ static
vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); - vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd); + vp9_copy_reference_enc(ctx->cpi, + ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; } else return VPX_CODEC_INVALID_PARAM; @@ -889,7 +910,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); (void)ctr_id; @@ -1005,6 +1026,68 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } +static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id, + va_list args) { + unsigned int *data = va_arg(args, unsigned int *); + if (data) { + int res; + res = vp9_set_size_literal(ctx->cpi, *data, 0); + if (!res) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp9e_set_height(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + unsigned int *data = va_arg(args, unsigned int *); + + if (data) { + int res; + res = vp9_set_size_literal(ctx->cpi, 0, *data); + + if (!res) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp9e_set_layer(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + unsigned int *data = va_arg(args, unsigned int *); + + if (data) { + int res; + res = 0; + + res = vp9_switch_layer(ctx->cpi, *data); + + if (!res) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, + va_list args) { + int data = va_arg(args, int); + vp9_set_svc(ctx->cpi, data); + return VPX_CODEC_OK; +} static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP8_SET_REFERENCE, vp9e_set_reference}, @@ -1026,14 +1109,20 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP8E_GET_LAST_QUANTIZER, get_param}, {VP8E_GET_LAST_QUANTIZER_64, get_param}, {VP8E_SET_ARNR_MAXFRAMES, set_param}, - {VP8E_SET_ARNR_STRENGTH, set_param}, - {VP8E_SET_ARNR_TYPE, set_param}, + {VP8E_SET_ARNR_STRENGTH, set_param}, + {VP8E_SET_ARNR_TYPE, set_param}, {VP8E_SET_TUNING, set_param}, {VP8E_SET_CQ_LEVEL, set_param}, + {VP9E_SET_MAX_Q, set_param}, + {VP9E_SET_MIN_Q, set_param}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, {VP9E_SET_LOSSLESS, set_param}, {VP9E_SET_FRAME_PARALLEL_DECODING, set_param}, {VP9_GET_REFERENCE, get_reference}, + {VP9E_SET_WIDTH, vp9e_set_width}, + {VP9E_SET_HEIGHT, vp9e_set_height}, + {VP9E_SET_LAYER, vp9e_set_layer}, + {VP9E_SET_SVC, vp9e_set_svc}, { -1, NULL}, }; @@ -1053,7 +1142,7 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { VPX_RC_ONE_PASS, /* g_pass */ - 0, /* g_lag_in_frames */ + 25, /* g_lag_in_frames */ 0, /* rc_dropframe_thresh */ 0, /* rc_resize_allowed */ @@ -1065,7 +1154,7 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { {0}, /* rc_twopass_stats_in */ #endif 256, /* rc_target_bandwidth */ - 4, /* rc_min_quantizer */ + 0, /* rc_min_quantizer */ 63, /* rc_max_quantizer */ 100, /* rc_undershoot_pct */ 100, /* rc_overshoot_pct */ @@ -1076,13 +1165,15 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* 
rc_two_pass_vbrmin_section */ - 400, /* rc_two_pass_vbrmax_section */ + 2000, /* rc_two_pass_vbrmax_section */ /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ 0, /* kf_min_dist */ 9999, /* kf_max_dist */ + VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ + #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 05029b9..10b3238 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -15,11 +15,12 @@ #include "vpx/vp8dx.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" -#include "decoder/vp9_onyxd.h" -#include "decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_onyxd.h" +#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/vp9_iface_common.h" -#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) +#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) typedef vpx_codec_stream_info_t vp9_stream_info_t; /* Structures for handling memory allocations */ @@ -142,32 +143,64 @@ static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) { static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si) { - vpx_codec_err_t res = VPX_CODEC_OK; - if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM; + if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; - if (data + data_sz <= data) { - res = VPX_CODEC_INVALID_PARAM; - } else { - const int frame_marker = (data[0] >> 6) & 0x3; - const int version = (data[0] >> 4) & 0x3; + si->is_kf = 0; + si->w = si->h = 0; + + { + struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + const int frame_marker = vp9_rb_read_literal(&rb, 2); + const int version = vp9_rb_read_bit(&rb) | (vp9_rb_read_bit(&rb) << 1); if (frame_marker != 0x2) return VPX_CODEC_UNSUP_BITSTREAM; +#if CONFIG_NON420 + if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM; +#else if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; +#endif + + if (vp9_rb_read_bit(&rb)) { // show an existing frame + return VPX_CODEC_OK; + } - si->is_kf = !((data[0] >> 2) & 0x1); + si->is_kf = !vp9_rb_read_bit(&rb); if (si->is_kf) { - const uint8_t *c = data + 1; + const int sRGB = 7; + int colorspace; - if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2) + rb.bit_offset += 1; // show frame + rb.bit_offset += 1; // error resilient + + if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 || + vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 || + vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) { return VPX_CODEC_UNSUP_BITSTREAM; + } - c += 3; - si->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1; - si->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1; + colorspace = vp9_rb_read_literal(&rb, 3); + if (colorspace != sRGB) { + rb.bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range + if (version == 1) { + rb.bit_offset += 2; // subsampling x/y + rb.bit_offset += 1; // has extra plane + } + } else { + if (version == 1) { + rb.bit_offset += 1; // has extra plane + } else { + // RGB is only available in version 1 + return VPX_CODEC_UNSUP_BITSTREAM; + } + } + + // TODO(jzern): these are available on non-keyframes in intra only mode. 
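The rewritten vp9_peek_si() above relies on the bit reader consuming bits most-significant-bit first, which is what makes the 2-bit frame marker come out of the top of the first byte. A sketch of the semantics assumed for vp9_rb_read_bit()/vp9_rb_read_literal() (struct and names simplified here for illustration, not copied from vp9/decoder/vp9_read_bit_buffer.h):

#include <stddef.h>
#include <stdint.h>

struct rb_sketch {
  const uint8_t *buf;
  size_t bit_offset;
};

static int rb_read_bit_sketch(struct rb_sketch *rb) {
  const size_t off = rb->bit_offset++;
  return (rb->buf[off >> 3] >> (7 - (off & 7))) & 1;  /* MSB first */
}

static int rb_read_literal_sketch(struct rb_sketch *rb, int bits) {
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--)  /* high bit is read first */
    value |= rb_read_bit_sketch(rb) << bit;
  return value;
}

Advancing rb.bit_offset without reading, as the header-skipping lines above do, simply discards fields of known width.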
+ si->w = vp9_rb_read_literal(&rb, 16) + 1; + si->h = vp9_rb_read_literal(&rb, 16) + 1; } } - return res; + return VPX_CODEC_OK; } static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, @@ -368,6 +401,8 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, uint32_t sizes[8]; int frames_this_pts, frame_count = 0; + if (data == NULL || data_sz == 0) return VPX_CODEC_INVALID_PARAM; + parse_superframe_index(data, data_sz, sizes, &frames_this_pts); do { @@ -561,7 +596,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); if (data) { diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 288c0d8..9fbf100 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -64,7 +64,7 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c VP9_CX_SRCS-yes += encoder/vp9_variance_c.c -ifeq ($(CONFIG_POSTPROC),yes) +ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif @@ -78,18 +78,18 @@ VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm ifeq ($(USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm endif ifeq ($(ARCH_X86_64),yes) @@ -100,5 +100,6 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libvpx/vp9_spatial_scalable_encoder.c b/libvpx/vp9_spatial_scalable_encoder.c new file mode 100644 index 0000000..8bb582f --- /dev/null +++ b/libvpx/vp9_spatial_scalable_encoder.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +/* + * This is an example demonstrating how to implement a multi-layer + * VP9 encoding scheme based on spatial scalability for video applications + * that benefit from a scalable bitstream. + */ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <time.h> +#include <string.h> +#include <unistd.h> +#include <libgen.h> +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#define interface (vpx_codec_vp9_cx()) +#define fourcc 0x30395056 +#define IVF_FILE_HDR_SZ (32) +#define IVF_FRAME_HDR_SZ (12) +#define NUM_BUFFERS 8 + +char *input_filename; +char *output_filename; +unsigned int number_frames_to_code = 60 * 60; +unsigned int number_frames_to_skip = 0; +unsigned int number_spatial_layers = 5; +unsigned int key_period = 100; + +typedef enum ENCODING_MODE { + INTER_LAYER_PREDICTION_I, + INTER_LAYER_PREDICTION_IP, + USE_GOLDEN_FRAME +} ENCODING_MODE; + +static void mem_put_le16(char *mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; +} + +static void mem_put_le32(char *mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; + mem[2] = val >> 16; + mem[3] = val >> 24; +} + +static void usage(char *program_name) { + printf( + "Usage: %s [-f frames] [-s skip_frames] [-w width] [-h height] \n\t" + "[-n rate_num] [-d rate_den] [-b bitrate] [-l layers] " + "<input_filename> <output_filename>\n", + basename(program_name)); + exit(EXIT_FAILURE); +} + +static void die(const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + if (fmt[strlen(fmt) - 1] != '\n') printf("\n"); + exit(EXIT_FAILURE); +} + +static void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + printf("%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) printf(" %s\n", detail); + exit(EXIT_FAILURE); +} + +static int read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w * img->h * 3 / 2; + nbytes = fread(img->planes[0], 1, to_read, f); + if (nbytes != to_read) { + res = 0; + if (nbytes > 0) + printf("Warning: Read partial frame. 
Check your width & height!\n"); + } + return res; +} + +static int read_dummy_frame(vpx_image_t *img) { + size_t to_read; + + to_read = img->w * img->h * 3 / 2; + memset(img->planes[0], 129, to_read); + return 1; +} + +static void write_ivf_file_header(FILE *outfile, const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); /* version */ + mem_put_le16(header + 6, 32); /* headersize */ + mem_put_le32(header + 8, fourcc); /* fourcc */ + mem_put_le16(header + 12, cfg->g_w); /* width */ + mem_put_le16(header + 14, cfg->g_h); /* height */ + mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header + 24, frame_cnt); /* length */ + mem_put_le32(header + 28, 0); /* unused */ + + (void)fwrite(header, 1, 32, outfile); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) { + char header[12]; + vpx_codec_pts_t pts; + + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header + 4, pts & 0xFFFFFFFF); + mem_put_le32(header + 8, pts >> 32); + + (void)fwrite(header, 1, 12, outfile); +} + +static void check_parameters() { + if (number_spatial_layers > 5) die("Cannot support more than 5 layers"); +} + +static void parse_command_line(int argc, char **argv, + vpx_codec_enc_cfg_t *cfg) { + unsigned int width = 1920; + unsigned int height = 1080; + unsigned int timebase_num = 1; + unsigned int timebase_den = 60; + unsigned int bitrate = 1000; + int c; + vpx_codec_err_t res; + + opterr = 0; + while ((c = getopt(argc, argv, "f:w:h:n:d:b:s:l:p:")) != -1) switch (c) { + case 'f': + number_frames_to_code = atoi(optarg); + break; + case 'w': + width = atoi(optarg); + break; + case 'h': + height = atoi(optarg); + break; + case 'n': + timebase_num = atoi(optarg); + break; + case 'd': + timebase_den = atoi(optarg); + break; + case 'b': + bitrate = atoi(optarg); + break; + case 's': + number_frames_to_skip = atoi(optarg); + break; + case 'l': + number_spatial_layers = atoi(optarg); + break; + case 'p': + key_period = atoi(optarg); + break; + case '?': + usage(argv[0]); + } + + // Parse required parameters + if (argc - optind != 2) { + usage(argv[0]); + } + + input_filename = argv[optind]; + output_filename = argv[optind + 1]; + + if (width < 16 || width % 2 || height < 16 || height % 2) + die("Invalid resolution: %d x %d", width, height); + + /* Populate encoder configuration */ + res = vpx_codec_enc_config_default(interface, cfg, 0); + if (res) { + die("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + } + printf( + "Codec %s\nframes: %d, skip: %d, layers: %d\n" + "width %d, height: %d, \n" + "num: %d, den: %d, bitrate: %d, \n" + "key period: %d \n", + vpx_codec_iface_name(interface), number_frames_to_code, + number_frames_to_skip, number_spatial_layers, width, height, timebase_num, + timebase_den, bitrate, key_period); + + // Do minimal check at the application level.
+
+static void set_default_configuration(vpx_codec_enc_cfg_t *cfg) {
+  /* Real time parameters */
+  cfg->rc_dropframe_thresh = 0;
+  cfg->rc_end_usage = VPX_CBR;
+  cfg->rc_resize_allowed = 0;
+  cfg->rc_min_quantizer = 33;
+  cfg->rc_max_quantizer = 33;
+  cfg->rc_undershoot_pct = 100;
+  cfg->rc_overshoot_pct = 15;
+  cfg->rc_buf_initial_sz = 500;
+  cfg->rc_buf_optimal_sz = 600;
+  cfg->rc_buf_sz = 1000;
+
+  /* Enable error resilient mode */
+  cfg->g_error_resilient = 1;
+  cfg->g_lag_in_frames = 0;
+
+  /* Disable automatic keyframe placement */
+  cfg->kf_mode = VPX_KF_DISABLED;
+  cfg->kf_min_dist = cfg->kf_max_dist = 3000;
+}
+
+static void initialize_codec(vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *cfg) {
+  int max_intra_size_pct;
+
+  /* Initialize codec */
+  if (vpx_codec_enc_init(codec, interface, cfg, VPX_CODEC_USE_PSNR))
+    die_codec(codec, "Failed to initialize encoder");
+
+  vpx_codec_control(codec, VP9E_SET_SVC, 1);
+  /* Cap CPU & first I-frame size */
+  vpx_codec_control(codec, VP8E_SET_CPUUSED, 1);
+  vpx_codec_control(codec, VP8E_SET_STATIC_THRESHOLD, 1);
+  vpx_codec_control(codec, VP8E_SET_NOISE_SENSITIVITY, 1);
+  vpx_codec_control(codec, VP8E_SET_TOKEN_PARTITIONS, 1);
+
+  max_intra_size_pct =
+      (int)(((double)cfg->rc_buf_optimal_sz * 0.5) *
+            ((double)cfg->g_timebase.den / cfg->g_timebase.num) / 10.0);
+  /* printf ("max_intra_size_pct=%d\n", max_intra_size_pct); */
+
+  vpx_codec_control(codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+}
+
+static int calculate_layer(int frame_cnt, int number_spatial_layers) {
+  if (frame_cnt == 0)
+    return 0;
+  else
+    return (frame_cnt + number_spatial_layers - 1) % number_spatial_layers;
+}
+
+static void switch_to_layer(int layer, unsigned int initial_width,
+                            unsigned int initial_height,
+                            vpx_codec_ctx_t *codec) {
+  // Set layer size
+  int scaling_factor_num[MAX_LAYERS] = {2, 1, 4, 2, 1};
+  int scaling_factor_den[MAX_LAYERS] = {9, 3, 9, 3, 1};
+
+  int quantizer[MAX_LAYERS] = {60, 53, 39, 33, 27};
+
+  unsigned int current_width;
+  unsigned int current_height;
+
+  current_width = initial_width *
+                  scaling_factor_num[layer + 5 - number_spatial_layers] /
+                  scaling_factor_den[layer + 5 - number_spatial_layers];
+  current_height = initial_height *
+                   scaling_factor_num[layer + 5 - number_spatial_layers] /
+                   scaling_factor_den[layer + 5 - number_spatial_layers];
+
+  current_width += current_width % 2;
+  current_height += current_height % 2;
+
+  vpx_codec_control(codec, VP9E_SET_WIDTH, &current_width);
+  vpx_codec_control(codec, VP9E_SET_HEIGHT, &current_height);
+
+  // Set layer context
+  vpx_codec_control(codec, VP9E_SET_LAYER, &layer);
+  vpx_codec_control(codec, VP9E_SET_MAX_Q,
+                    quantizer[layer + 5 - number_spatial_layers]);
+  vpx_codec_control(codec, VP9E_SET_MIN_Q,
+                    quantizer[layer + 5 - number_spatial_layers]);
+}
+
+static int get_flag(int is_I_frame_in_layer, int layer, ENCODING_MODE mode) {
+  // First layer
+  switch (mode) {
+    case INTER_LAYER_PREDICTION_I:
+      if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
+      if (layer == 0)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      else if (is_I_frame_in_layer)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
+      else
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      break;
+
+    case INTER_LAYER_PREDICTION_IP:
+      if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
+      if (layer == 0)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      else if (is_I_frame_in_layer)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
+      else
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
+      break;
+
+    case USE_GOLDEN_FRAME:
+      if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
+      if (2 * number_spatial_layers - NUM_BUFFERS <= layer) {
+        if (layer == 0)
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_ARF;
+        else if (is_I_frame_in_layer)
+          return VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF |
+                 VP8_EFLAG_NO_REF_LAST;
+        else
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        if (layer == 0)
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+        else if (is_I_frame_in_layer)
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
+        else
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      }
+      break;
+    default:
+      return VPX_EFLAG_FORCE_KF;
+  }
+}
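To make the scaling arithmetic in switch_to_layer() concrete, a worked example with number_spatial_layers = 5 and a 1920x1080 input (integer division; the current_width += current_width % 2 step rounds odd results up to even):

    /* layer 0: 1920*2/9 x 1080*2/9  ->  426x240    (min/max Q 60)
     * layer 1: 1920*1/3 x 1080*1/3  ->  640x360    (Q 53)
     * layer 2: 1920*4/9 x 1080*4/9  ->  853x480, rounded to 854x480  (Q 39)
     * layer 3: 1920*2/3 x 1080*2/3  -> 1280x720    (Q 33)
     * layer 4: unscaled             -> 1920x1080   (Q 27)
     */

With fewer layers, the layer + 5 - number_spatial_layers offset drops the smallest entries, so three layers, for example, use only the 4/9, 2/3 and 1/1 factors.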
+
+int main(int argc, char **argv) {
+  FILE *infile, *outfile[MAX_LAYERS];
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_cnt = 0;
+  vpx_image_t raw;
+  int frame_avail = 1;
+  int got_data = 0;
+  int i;
+  int frames_in_layer[MAX_LAYERS] = {0};
+  clock_t before;
+  clock_t after;
+  int pts = 0;            /* PTS starts at 0 */
+  int frame_duration = 1; /* 1 timebase tick per frame */
+
+  parse_command_line(argc, argv, &cfg);
+
+  // Allocate image buffer
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 32))
+    die("Failed to allocate image (%dx%d)", cfg.g_w, cfg.g_h);
+
+  set_default_configuration(&cfg);
+
+  /* Open input file */
+  if (!(infile = fopen(input_filename, "rb")))
+    die("Failed to open %s for reading", input_filename);
+
+  /* Open output file */
+  for (i = 0; i < number_spatial_layers; i++) {
+    char file_name[512];
+    snprintf(file_name, sizeof(file_name), "%s_%d.ivf", output_filename, i);
+    if (!(outfile[i] = fopen(file_name, "wb")))
+      die("Failed to open %s for writing", file_name);
+    write_ivf_file_header(outfile[i], &cfg, 0);
+  }
+
+  initialize_codec(&codec, &cfg);
+
+  // Skip initial frames
+  for (i = 0; i < number_frames_to_skip; i++) {
+    read_frame(infile, &raw);
+  }
+
+  before = clock();
+  // Encoding frames
+  while ((frame_avail || got_data) &&
+         frame_cnt <= number_frames_to_code * number_spatial_layers) {
+    int flags = 0;
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *pkt;
+
+    int layer = calculate_layer(frame_cnt, number_spatial_layers);
+    int is_I_frame_in_layer =
+        (((frame_cnt - 1) / number_spatial_layers % key_period) == 0);
+    int is_dummy = (frame_cnt == 0);
+
+    if (is_dummy) {  // Dummy frame
+      flags = VPX_EFLAG_FORCE_KF;
+      frame_avail = read_dummy_frame(&raw);
+
+    } else {  // Regular frame
+      // Read a new frame only at the base layer
+      if (layer == 0) frame_avail = read_frame(infile, &raw);
+      switch_to_layer(layer, cfg.g_w, cfg.g_h, &codec);
+      flags = get_flag(is_I_frame_in_layer, layer, INTER_LAYER_PREDICTION_I);
+    }
+
+    // Actual Encoding
+    if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags,
+                         VPX_DL_REALTIME))
+      die_codec(&codec, "Failed to encode frame");
+
+    got_data = 0;
+    // Process data / Get PSNR statistics
+    while ((pkt = vpx_codec_get_cx_data(&codec, &iter))) {
+      got_data = 1;
+      switch (pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT:
+          for (i = layer; i < number_spatial_layers; i++) {
+            write_ivf_frame_header(outfile[i], pkt);
+            (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                         outfile[i]);
+            frames_in_layer[i]++;
+          }
+          break;
+        case VPX_CODEC_PSNR_PKT:
+          if (frame_cnt != 0)
+            printf(
+                "Processed Frame %d, layer %d, PSNR(Total/Y/U/V): "
+                "%2.3f %2.3f %2.3f %2.3f\n",
+                (frame_cnt - 1) / number_spatial_layers + 1, layer,
+                pkt->data.psnr.psnr[0], pkt->data.psnr.psnr[1],
+                pkt->data.psnr.psnr[2], pkt->data.psnr.psnr[3]);
+          break;
+        default:
+          break;
+      }
+    }
+    frame_cnt++;
+    // TODO(ivan): Modify ts later if(!layer)
+    pts += frame_duration;
+  }
+  // end while
+
+  after = clock();
+  printf("Processed %d frames in different resolutions in %ld ms.\n",
+         frame_cnt - 1, (long)((after - before) / (CLOCKS_PER_SEC / 1000)));
+
+  fclose(infile);
+
+  if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  /* Try to rewrite the output file headers with the actual frame count */
+  for (i = 0; i < number_spatial_layers; i++) {
+    if (!fseek(outfile[i], 0, SEEK_SET)) {
+      write_ivf_file_header(outfile[i], &cfg, frames_in_layer[i]);
+    }
+    fclose(outfile[i]);
+  }
+
+  return EXIT_SUCCESS;
+}
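A note on the schedule the loop above implements: frame_cnt 0 encodes a dummy keyframe (read_dummy_frame() fills the buffer with the constant 129), and calculate_layer() then cycles successive frame counts through the layers, so each source frame, read only when layer == 0, is encoded once per layer. A quick check with a hypothetical number_spatial_layers = 3:

    /* calculate_layer(1, 3) == (1 + 2) % 3 == 0   (base layer, new source frame)
     * calculate_layer(2, 3) == (2 + 2) % 3 == 1
     * calculate_layer(3, 3) == (3 + 2) % 3 == 2
     * calculate_layer(4, 3) == (4 + 2) % 3 == 0   (next source frame)
     */

And since a packet produced at layer k is written to outfile[k] through outfile[number_spatial_layers - 1], each out_k.ivf accumulates layers 0 through k: the higher-numbered files are supersets that decode to higher resolutions.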
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index f8e2ef9..f3ea6d3 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -190,7 +190,15 @@ enum vp8e_enc_control_id {
   VP9E_SET_LOSSLESS,
   VP9E_SET_TILE_COLUMNS,
   VP9E_SET_TILE_ROWS,
-  VP9E_SET_FRAME_PARALLEL_DECODING
+  VP9E_SET_FRAME_PARALLEL_DECODING,
+
+  VP9E_SET_WIDTH = 99,
+  VP9E_SET_HEIGHT,
+  VP9E_SET_LAYER,
+  VP9E_SET_SVC,
+
+  VP9E_SET_MAX_Q,
+  VP9E_SET_MIN_Q
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -292,6 +300,12 @@ VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
 
+VPX_CTRL_USE_TYPE(VP9E_SET_LAYER, int *)
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_WIDTH, unsigned int *)
+VPX_CTRL_USE_TYPE(VP9E_SET_HEIGHT, unsigned int *)
+
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int)
@@ -316,6 +330,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_Q, unsigned int)
+VPX_CTRL_USE_TYPE(VP9E_SET_MIN_Q, unsigned int)
 /*! @} - end defgroup vp8_encoder */
 #include "vpx_codec_impl_bottom.h"
 #endif
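The VPX_CTRL_USE_TYPE declarations above fix the argument conventions for the new controls: VP9E_SET_SVC takes a plain int, while VP9E_SET_LAYER, VP9E_SET_WIDTH and VP9E_SET_HEIGHT take pointers. A minimal sketch of a caller (a hypothetical helper, mirroring the example encoder earlier in this change):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void select_spatial_layer(vpx_codec_ctx_t *codec, int layer,
                                     unsigned int width, unsigned int height) {
      vpx_codec_control(codec, VP9E_SET_SVC, 1);          /* int, by value */
      vpx_codec_control(codec, VP9E_SET_LAYER, &layer);   /* int * */
      vpx_codec_control(codec, VP9E_SET_WIDTH, &width);   /* unsigned int * */
      vpx_codec_control(codec, VP9E_SET_HEIGHT, &height); /* unsigned int * */
    }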
diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h
index ffdbc06..56fd2d9 100644
--- a/libvpx/vpx/vpx_encoder.h
+++ b/libvpx/vpx/vpx_encoder.h
@@ -46,6 +46,12 @@ extern "C" {
 /*!\deprecated Use #VPX_TS_MAX_LAYERS instead. */
 #define MAX_LAYERS VPX_TS_MAX_LAYERS
+/*! Spatial Scalability: Maximum number of coding layers */
+#define VPX_SS_MAX_LAYERS 5
+
+/*! Spatial Scalability: Default number of coding layers */
+#define VPX_SS_DEFAULT_LAYERS 3
+
 /*!\brief Current ABI version number
  *
  * \internal
@@ -217,9 +223,10 @@ extern "C" {
 /*!\brief Rate control mode */
 enum vpx_rc_mode {
-  VPX_VBR, /**< Variable Bit Rate (VBR) mode */
+  VPX_VBR,  /**< Variable Bit Rate (VBR) mode */
   VPX_CBR,  /**< Constant Bit Rate (CBR) mode */
-  VPX_CQ  /**< Constant Quality (CQ) mode */
+  VPX_CQ,   /**< Constrained Quality (CQ) mode */
+  VPX_Q,    /**< Constant Quality (Q) mode */
 };
@@ -595,8 +602,14 @@ extern "C" {
   unsigned int kf_max_dist;
 
   /*
-   * Temporal scalability settings (ts)
+   * Spatial scalability settings (ss)
+   */
+
+  /*!\brief Number of coding layers (spatial)
+   *
+   * This value specifies the number of coding layers to be used.
    */
+  unsigned int ss_number_layers;
 
   /*!\brief Number of coding layers
    *
diff --git a/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/libvpx/vpx_mem/include/vpx_mem_intrnl.h
index 60b5165..2248ad5 100644
--- a/libvpx/vpx_mem/include/vpx_mem_intrnl.h
+++ b/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -50,14 +50,10 @@ vpx_memcpy, _memset, and _memmove*/
 calls to vpx_* functions other than vpx_memalign*/
 # else
-# define DEFAULT_ALIGNMENT 1
+# define DEFAULT_ALIGNMENT (2 * sizeof(void*))  /* NOLINT */
 # endif
 #endif
 
-#if DEFAULT_ALIGNMENT < 1
-# error "DEFAULT_ALIGNMENT must be >= 1!"
-#endif
-
 #if CONFIG_MEM_TRACKER
 # define TRY_BOUNDS_CHECK 1
 /*when set to 1 pads each allocation, integrity can be checked using
diff --git a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm
index cc1789a..d070a47 100644
--- a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm
+++ b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm
@@ -9,7 +9,7 @@
 ;
 
-    EXPORT  |vp8_yv12_copy_y_neon|
+    EXPORT  |vpx_yv12_copy_y_neon|
 
     ARM
     REQUIRE8
@@ -19,8 +19,9 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vpxyv12_copy_y_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-|vp8_yv12_copy_y_neon| PROC
+;void vpx_yv12_copy_y_neon(const YV12_BUFFER_CONFIG *src_ybc,
+;                          YV12_BUFFER_CONFIG *dst_ybc)
+|vpx_yv12_copy_y_neon| PROC
     push            {r4 - r11, lr}
     vpush           {d8-d15}
diff --git a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
index 3f17883..696f47a 100644
--- a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
+++ b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
@@ -18,7 +18,7 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
 ;                                   YV12_BUFFER_CONFIG *dst_ybc);
 |vp8_yv12_copy_frame_func_neon| PROC
diff --git a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
index d452ad2..d3306b6 100644
--- a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
+++ b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
@@ -17,11 +17,12 @@
     INCLUDE vpx_scale_asm_offsets.asm
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;Note: This function is used to copy source data in src_buffer[i] at beginning of
-;the encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,
-;which can be ANY numbers(NOT always multiples of 16 or 4).
+;Note: This function is used to copy source data in src_buffer[i] at beginning
+;of the encoding.
The buffer has a width and height of cpi->oxcf.Width and +;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4). -;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, +; YV12_BUFFER_CONFIG *dst_ybc); |vp8_yv12_copy_src_frame_func_neon| PROC push {r4 - r11, lr} diff --git a/libvpx/vpx_scale/arm/neon/yv12extend_arm.c b/libvpx/vpx_scale/arm/neon/yv12extend_arm.c index 4535b8f..fac7bbc 100644 --- a/libvpx/vpx_scale/arm/neon/yv12extend_arm.c +++ b/libvpx/vpx_scale/arm/neon/yv12extend_arm.c @@ -10,12 +10,12 @@ #include "./vpx_scale_rtcd.h" -extern void vp8_yv12_copy_frame_func_neon(struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc); +extern void vp8_yv12_copy_frame_func_neon( + const struct yv12_buffer_config *src_ybc, + struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_frame_neon(struct yv12_buffer_config *src_ybc, +void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc) { vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - vp8_yv12_extend_frame_borders_neon(dst_ybc); } diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index 2592040..a89e29d 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++ b/libvpx/vpx_scale/generic/yv12config.c @@ -192,7 +192,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size + (alpha_border_h * alpha_stride) + alpha_border_w; #endif - ybf->corrupted = 0; /* assume not currupted by errors */ + ybf->corrupted = 0; /* assume not corrupted by errors */ return 0; } return -2; diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c index cc8da2a..f2aec2b 100644 --- a/libvpx/vpx_scale/generic/yv12extend.c +++ b/libvpx/vpx_scale/generic/yv12extend.c @@ -10,67 +10,52 @@ #include <assert.h> #include "./vpx_config.h" -#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -#include "vpx_scale/vpx_scale.h" - -/**************************************************************************** -* Exports -****************************************************************************/ +#include "vpx_scale/yv12config.h" -/**************************************************************************** - * - ****************************************************************************/ -static void extend_plane(uint8_t *s, /* source */ - int sp, /* source pitch */ - int w, /* width */ - int h, /* height */ - int et, /* extend top border */ - int el, /* extend left border */ - int eb, /* extend bottom border */ - int er) { /* extend right border */ +static void extend_plane(uint8_t *const src, int src_stride, + int width, int height, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { int i; - uint8_t *src_ptr1, *src_ptr2; - uint8_t *dest_ptr1, *dest_ptr2; - int linesize; + const int linesize = extend_left + extend_right + width; /* copy the left and right most columns out */ - src_ptr1 = s; - src_ptr2 = s + w - 1; - dest_ptr1 = s - el; - dest_ptr2 = s + w; - - for (i = 0; i < h; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], el); - vpx_memset(dest_ptr2, src_ptr2[0], er); - src_ptr1 += sp; - src_ptr2 += sp; - dest_ptr1 += sp; - dest_ptr2 += sp; + uint8_t *src_ptr1 = src; + uint8_t *src_ptr2 = src + width - 1; + uint8_t *dst_ptr1 = src - extend_left; + uint8_t *dst_ptr2 
= src + width; + + for (i = 0; i < height; ++i) { + vpx_memset(dst_ptr1, src_ptr1[0], extend_left); + vpx_memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; } /* Now copy the top and bottom lines into each line of the respective * borders */ - src_ptr1 = s - el; - src_ptr2 = s + sp * (h - 1) - el; - dest_ptr1 = s + sp * (-et) - el; - dest_ptr2 = s + sp * (h) - el; - linesize = el + er + w; - - for (i = 0; i < et; i++) { - vpx_memcpy(dest_ptr1, src_ptr1, linesize); - dest_ptr1 += sp; + src_ptr1 = src - extend_left; + src_ptr2 = src + src_stride * (height - 1) - extend_left; + dst_ptr1 = src + src_stride * -extend_top - extend_left; + dst_ptr2 = src + src_stride * height - extend_left; + + for (i = 0; i < extend_top; ++i) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += src_stride; } - for (i = 0; i < eb; i++) { - vpx_memcpy(dest_ptr2, src_ptr2, linesize); - dest_ptr2 += sp; + for (i = 0; i < extend_bottom; ++i) { + vpx_memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += src_stride; } } -void -vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { +void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); @@ -96,9 +81,9 @@ vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { } #if CONFIG_VP9 -static void extend_frame(YV12_BUFFER_CONFIG *ybf, - int subsampling_x, int subsampling_y, - int ext_size) { +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, + int subsampling_x, int subsampling_y, + int ext_size) { const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x; const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y; const int c_et = ext_size >> subsampling_y; @@ -126,7 +111,6 @@ static void extend_frame(YV12_BUFFER_CONFIG *ybf, c_w, c_h, c_et, c_el, c_eb, c_er); } - void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, int subsampling_x, int subsampling_y) { extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); @@ -134,33 +118,20 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, int subsampling_x, int subsampling_y) { - const int inner_bw = ybf->border > VP9INNERBORDERINPIXLES ? - VP9INNERBORDERINPIXLES : ybf->border; + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ? + VP9INNERBORDERINPIXELS : ybf->border; extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); } -#endif +#endif // CONFIG_VP9 -/**************************************************************************** - * - * ROUTINE : vp8_yv12_copy_frame - * - * INPUTS : - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Copies the source image into the destination image and - * updates the destination's UMV borders. - * - * SPECIAL NOTES : The frames are assumed to be identical in size. - * - ****************************************************************************/ -void -vp8_yv12_copy_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) { +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. 
+void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { int row; - unsigned char *source, *dest; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; #if 0 /* These assertions are valid in the codec, but the libvpx-tester uses @@ -170,48 +141,42 @@ vp8_yv12_copy_frame_c(YV12_BUFFER_CONFIG *src_ybc, assert(src_ybc->y_height == dst_ybc->y_height); #endif - source = src_ybc->y_buffer; - dest = dst_ybc->y_buffer; - - for (row = 0; row < src_ybc->y_height; row++) { - vpx_memcpy(dest, source, src_ybc->y_width); - source += src_ybc->y_stride; - dest += dst_ybc->y_stride; + for (row = 0; row < src_ybc->y_height; ++row) { + vpx_memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; } - source = src_ybc->u_buffer; - dest = dst_ybc->u_buffer; + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; - for (row = 0; row < src_ybc->uv_height; row++) { - vpx_memcpy(dest, source, src_ybc->uv_width); - source += src_ybc->uv_stride; - dest += dst_ybc->uv_stride; + for (row = 0; row < src_ybc->uv_height; ++row) { + vpx_memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; } - source = src_ybc->v_buffer; - dest = dst_ybc->v_buffer; + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; - for (row = 0; row < src_ybc->uv_height; row++) { - vpx_memcpy(dest, source, src_ybc->uv_width); - source += src_ybc->uv_stride; - dest += dst_ybc->uv_stride; + for (row = 0; row < src_ybc->uv_height; ++row) { + vpx_memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; } vp8_yv12_extend_frame_borders_c(dst_ybc); } -void vp8_yv12_copy_y_c(YV12_BUFFER_CONFIG *src_ybc, +void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; - unsigned char *source, *dest; - - - source = src_ybc->y_buffer; - dest = dst_ybc->y_buffer; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; - for (row = 0; row < src_ybc->y_height; row++) { - vpx_memcpy(dest, source, src_ybc->y_width); - source += src_ybc->y_stride; - dest += dst_ybc->y_stride; + for (row = 0; row < src_ybc->y_height; ++row) { + vpx_memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; } } diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.sh b/libvpx/vpx_scale/vpx_scale_rtcd.sh index 21d1e52..ea7b0e2 100644 --- a/libvpx/vpx_scale/vpx_scale_rtcd.sh +++ b/libvpx/vpx_scale/vpx_scale_rtcd.sh @@ -19,11 +19,11 @@ fi prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf" specialize vp8_yv12_extend_frame_borders neon -prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" +prototype void vp8_yv12_copy_frame "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" specialize vp8_yv12_copy_frame neon -prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" -specialize vp8_yv12_copy_y neon +prototype void vpx_yv12_copy_y "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" +specialize vpx_yv12_copy_y neon if [ "$CONFIG_VP9" = "yes" ]; then prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h index 66e587a..0e950fb 100644 --- a/libvpx/vpx_scale/yv12config.h +++ b/libvpx/vpx_scale/yv12config.h @@ -18,7 +18,7 @@ extern "C" { 
#include "vpx/vpx_integer.h" #define VP8BORDERINPIXELS 32 -#define VP9INNERBORDERINPIXLES 96 +#define VP9INNERBORDERINPIXELS 96 #define VP9BORDERINPIXELS 160 #define VP9_INTERP_EXTEND 4 diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index 547b572..0c742ca 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -1046,6 +1046,7 @@ static const struct arg_enum_list end_usage_enum[] = { {"vbr", VPX_VBR}, {"cbr", VPX_CBR}, {"cq", VPX_CQ}, + {"q", VPX_Q}, {NULL, 0} }; static const arg_def_t end_usage = ARG_DEF_ENUM(NULL, "end-usage", 1, @@ -1126,7 +1127,7 @@ static const struct arg_enum_list tuning_enum[] = { static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, - "Constrained Quality Level"); + "Constant/Constrained Quality level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode"); @@ -1688,8 +1689,10 @@ static void parse_global_config(struct global_config *global, char **argv) { /* Initialize default parameters */ memset(global, 0, sizeof(*global)); global->codec = codecs; - global->passes = 1; + global->passes = 0; global->use_i420 = 1; + /* Assign default deadline to good quality */ + global->deadline = VPX_DL_GOOD_QUALITY; for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; @@ -1761,6 +1764,11 @@ static void parse_global_config(struct global_config *global, char **argv) { } /* Validate global config */ + if (global->passes == 0) { + // Make default VP9 passes = 2 until there is a better quality 1-pass + // encoder + global->passes = (global->codec->iface == vpx_codec_vp9_cx ? 
2 : 1); + } if (global->pass) { /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ @@ -2631,8 +2639,8 @@ int main(int argc, const char **argv_) { &global.framerate)); } - FOREACH_STREAM(open_output_file(stream, &global)); FOREACH_STREAM(setup_pass(stream, &global, pass)); + FOREACH_STREAM(open_output_file(stream, &global)); FOREACH_STREAM(initialize_encoder(stream, &global)); frame_avail = 1; diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt index 299d615..897d207 100644 --- a/mips-dspr2/libvpx_srcs.txt +++ b/mips-dspr2/libvpx_srcs.txt @@ -178,6 +178,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h index d6dc6bf..b23f1a6 100644 --- a/mips-dspr2/vp9_rtcd.h +++ b/mips-dspr2/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void 
vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t 
*above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/mips-dspr2/vpx_config.c b/mips-dspr2/vpx_config.c index cf19239..1036456 100644 --- a/mips-dspr2/vpx_config.c +++ b/mips-dspr2/vpx_config.c @@ -5,5 +5,5 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ -static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --enable-dspr2 --disable-examples --disable-docs --enable-realtime-only"; +static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --enable-dspr2 --disable-examples --disable-docs --enable-realtime-only"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/mips-dspr2/vpx_config.h b/mips-dspr2/vpx_config.h index e85b676..e6cad01 100644 --- a/mips-dspr2/vpx_config.h +++ b/mips-dspr2/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 1 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/mips-dspr2/vpx_scale_rtcd.h b/mips-dspr2/vpx_scale_rtcd.h index be038f4..d9e41f3 100644 --- a/mips-dspr2/vpx_scale_rtcd.h +++ b/mips-dspr2/vpx_scale_rtcd.h @@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y vp8_yv12_copy_y_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt index 055f5fb..8e6fad7 100644 --- a/mips/libvpx_srcs.txt +++ b/mips/libvpx_srcs.txt @@ -172,6 +172,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h index d6dc6bf..b23f1a6 100644 --- a/mips/vp9_rtcd.h +++ b/mips/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t 
*dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c

-void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c

-void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c

-void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c

-void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c

-void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c

-void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c

-void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c

-void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c

-void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c

-void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c

-void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
-#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c

-void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c

-void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c

-void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c

-void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c

-void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c

-void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c

-void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c

-void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c

-void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c

-void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c

-void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c

-void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c

-void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
-#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c

-void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c

-void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c

-void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c

-void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c

-void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c

-void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c

-void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c

-void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c

-void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c

-void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c

-void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c

-void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c

 void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
@@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride)
 void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c

-unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad);
-#define vp9_sad32x3 vp9_sad32x3_c
-
-unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad);
-#define vp9_sad3x32 vp9_sad3x32_c
-
 void vp9_rtcd(void);

 #include "vpx_config.h"
diff --git a/mips/vpx_config.c b/mips/vpx_config.c
index 84f0e8b..8c995fb 100644
--- a/mips/vpx_config.c
+++ b/mips/vpx_config.c
@@ -5,5 +5,5 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only";
+static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/mips/vpx_config.h b/mips/vpx_config.h
index 7db47f8..8ead72e 100644
--- a/mips/vpx_config.h
+++ b/mips/vpx_config.h
@@ -59,6 +59,7 @@
 #define CONFIG_DC_RECON 1
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
 #define CONFIG_MULTITHREAD 1
 #define CONFIG_INTERNAL_STATS 0
 #define CONFIG_VP8_ENCODER 1
diff --git a/mips/vpx_scale_rtcd.h b/mips/vpx_scale_rtcd.h
index be038f4..d9e41f3 100644
--- a/mips/vpx_scale_rtcd.h
+++ b/mips/vpx_scale_rtcd.h
@@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c

-void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c

-void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_y vp8_yv12_copy_y_c
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c

 void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
 #define vp9_extend_frame_borders vp9_extend_frame_borders_c
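Editor's note: the bulk of the vp9_rtcd.h churn above is one API change applied to every intra predictor: vp9_d27_* is renamed vp9_d207_* to match the upstream directional-mode naming, the destination parameter becomes a plain dst, and the above/left reference arrays become const-qualified. The sketch below shows how a predictor with the new (dst, y_stride, above, left) prototype is driven. It is a minimal, self-contained stand-in, not libvpx's implementation: the local dc_predictor_8x8 body and the sample pixel values are assumptions for illustration; only the prototype mirrors the declarations in the diff.

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-in with the same prototype as vp9_dc_predictor_8x8_c
     * above: dst receives the 8x8 prediction, y_stride is the row pitch of
     * dst, and above/left are the const-qualified reference pixel arrays. */
    static void dc_predictor_8x8(uint8_t *dst, ptrdiff_t y_stride,
                                 const uint8_t *above, const uint8_t *left) {
      int sum = 0;
      for (int i = 0; i < 8; i++) sum += above[i] + left[i];
      const uint8_t dc = (uint8_t)((sum + 8) / 16);  /* rounded average */
      for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++) dst[r * y_stride + c] = dc;
    }

    int main(void) {
      const uint8_t above[8] = {100, 100, 100, 100, 100, 100, 100, 100};
      const uint8_t left[8]  = {120, 120, 120, 120, 120, 120, 120, 120};
      uint8_t dst[8 * 8] = {0};
      dc_predictor_8x8(dst, 8, above, left);  /* stride == block width here */
      printf("dc value: %d\n", dst[0]);       /* (800 + 960 + 8) / 16 = 110 */
      return 0;
    }

Note that callers never invoke the _c symbols directly: the #define lines in these generated per-target headers (e.g. #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c) pin each entry point to the C or, on NEON builds, assembly variant at compile time, which is consistent with CONFIG_RUNTIME_CPU_DETECT being 0 in the mips/vpx_config.h hunk above.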