Merge "Update external/libvpx to 1.5.0 release" brillo-m10-release brillo-m10-dev

author: Bill Yi <byi@google.com> 2016-02-22 19:24:59 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> 2016-02-22 19:24:59 +0000
commit: 857bb8df092ee86783ab6933063a736929a07227 (patch)
tree: f42181486e87a18dba9945956209fae0366172cb
parent: 30dc5b6cbc88d67b24843b52c282e13f070b4ebc (diff)
parent: c927526be9a7b72fb5edb3f29c4e8ceabe0ec98a (diff)
download: platform_external_libvpx-brillo-m10-dev.tar.gz
platform_external_libvpx-brillo-m10-dev.tar.bz2
platform_external_libvpx-brillo-m10-dev.zip
246 files changed, 8362 insertions, 8108 deletions
diff --git a/README.android b/README.android
index 5949fc65..36d716d8 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
 Name: libvpx
 URL: http://www.webmproject.org
-Version: v1.4.0
+Version: v1.5.0
 License: BSD
 License File: libvpx/LICENSE
 
-Date: Tuesday August 25 2015
-Branch: origin/master
-Commit: 7105df53d7dc13d5e575bc8df714ec8d1da36b06
+Date: Thursday November 19 2015
+Branch: javanwhistlingduck
+Commit: cbecf57f3e0d85a7b7f97f3ab7c507f6fe640a93
 
 Description:
 Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index f4d77c4e..48e6229d 100644
--- a/README.version
+++ b/README.version
@@ -1,4 +1,4 @@
-URL: https://chromium.googlesource.com/webm/libvpx/+archive/v1.4.0.tar.gz
-Version: 1.4.0
+URL: https://storage.googleapis.com/downloads.webmproject.org/releases/webm/libvpx-1.5.0.tar.bz2
+Version: 1.5.0
 BugComponent: 42195
 Owners: johannkoenig
diff --git a/config/arm-neon/libvpx_srcs.txt b/config/arm-neon/libvpx_srcs.txt
index 9d5084c3..bdeae071 100644
--- a/config/arm-neon/libvpx_srcs.txt
+++ b/config/arm-neon/libvpx_srcs.txt
@@ -14,7 +14,6 @@ vp8/common/arm/armv6/dequantize_v6.asm
 vp8/common/arm/armv6/filter_v6.asm
 vp8/common/arm/armv6/idct_blk_v6.c
 vp8/common/arm/armv6/idct_v6.asm
-vp8/common/arm/armv6/intra4x4_predict_v6.asm
 vp8/common/arm/armv6/iwalsh_v6.asm
 vp8/common/arm/armv6/loopfilter_v6.asm
 vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -36,7 +35,6 @@ vp8/common/arm/neon/iwalsh_neon.c
 vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
 vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
 vp8/common/arm/neon/mbloopfilter_neon.c
-vp8/common/arm/neon/reconintra_neon.c
 vp8/common/arm/neon/shortidct4x4llm_neon.c
 vp8/common/arm/neon/sixtappredict_neon.c
 vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -80,6 +78,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -298,6 +297,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/arm-neon/vp8_rtcd.h b/config/arm-neon/vp8_rtcd.h
index 0b836c4f..6fd2dac4 100644
--- a/config/arm-neon/vp8_rtcd.h
+++ b/config/arm-neon/vp8_rtcd.h
@@ -48,14 +48,6 @@ void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst,
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -117,10 +109,6 @@ void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm
index 6f032662..5b623b8c 100644
--- a/config/arm-neon/vpx_config.asm
+++ b/config/arm-neon/vpx_config.asm
@@ -28,7 +28,7 @@
 .equ HAVE_UNISTD_H ,  1
 .equ CONFIG_DEPENDENCY_TRACKING ,  1
 .equ CONFIG_EXTERNAL_BUILD ,  1
-.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
 .equ CONFIG_INSTALL_BINS ,  1
 .equ CONFIG_INSTALL_LIBS ,  1
 .equ CONFIG_INSTALL_SRCS ,  0
@@ -86,4 +86,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_MISC_FIXES ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h
index 8d02c250..d9d5f1ca 100644
--- a/config/arm-neon/vpx_config.h
+++ b/config/arm-neon/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index 4de075d7..ccb5df42 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -103,6 +103,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
@@ -118,6 +130,18 @@ void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -130,6 +154,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
@@ -254,6 +293,9 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -743,6 +785,9 @@ uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int sour
 uint32_t vpx_variance_halfpixvar16x16_v_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_media
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/arm-neon/vpx_version.h
+++ b/config/arm-neon/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/arm/libvpx_srcs.txt b/config/arm/libvpx_srcs.txt
index 53c4fda0..46a3c605 100644
--- a/config/arm/libvpx_srcs.txt
+++ b/config/arm/libvpx_srcs.txt
@@ -14,7 +14,6 @@ vp8/common/arm/armv6/dequantize_v6.asm
 vp8/common/arm/armv6/filter_v6.asm
 vp8/common/arm/armv6/idct_blk_v6.c
 vp8/common/arm/armv6/idct_v6.asm
-vp8/common/arm/armv6/intra4x4_predict_v6.asm
 vp8/common/arm/armv6/iwalsh_v6.asm
 vp8/common/arm/armv6/loopfilter_v6.asm
 vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -64,6 +63,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -272,6 +272,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/arm/vp8_rtcd.h b/config/arm/vp8_rtcd.h
index 7c2cefdd..f7287a52 100644
--- a/config/arm/vp8_rtcd.h
+++ b/config/arm/vp8_rtcd.h
@@ -45,12 +45,6 @@ void vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst,
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -101,10 +95,6 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_armv6
diff --git a/config/arm/vpx_config.asm b/config/arm/vpx_config.asm
index 2a69621b..992fdee9 100644
--- a/config/arm/vpx_config.asm
+++ b/config/arm/vpx_config.asm
@@ -28,7 +28,7 @@
 .equ HAVE_UNISTD_H ,  1
 .equ CONFIG_DEPENDENCY_TRACKING ,  1
 .equ CONFIG_EXTERNAL_BUILD ,  1
-.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
 .equ CONFIG_INSTALL_BINS ,  1
 .equ CONFIG_INSTALL_LIBS ,  1
 .equ CONFIG_INSTALL_SRCS ,  0
@@ -86,4 +86,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_MISC_FIXES ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/config/arm/vpx_config.h b/config/arm/vpx_config.h
index 62b62859..d6d28094 100644
--- a/config/arm/vpx_config.h
+++ b/config/arm/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/arm/vpx_dsp_rtcd.h b/config/arm/vpx_dsp_rtcd.h
index bb570a02..ce2aeac1 100644
--- a/config/arm/vpx_dsp_rtcd.h
+++ b/config/arm/vpx_dsp_rtcd.h
@@ -94,6 +94,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
 
@@ -106,6 +118,18 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -118,6 +142,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
 
@@ -217,6 +256,9 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -652,6 +694,9 @@ uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int sour
 uint32_t vpx_variance_halfpixvar16x16_v_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_media
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/arm/vpx_version.h b/config/arm/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/arm/vpx_version.h
+++ b/config/arm/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/arm64/libvpx_srcs.txt b/config/arm64/libvpx_srcs.txt
index 483ffbb5..97705603 100644
--- a/config/arm64/libvpx_srcs.txt
+++ b/config/arm64/libvpx_srcs.txt
@@ -19,7 +19,6 @@ vp8/common/arm/neon/iwalsh_neon.c
 vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
 vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
 vp8/common/arm/neon/mbloopfilter_neon.c
-vp8/common/arm/neon/reconintra_neon.c
 vp8/common/arm/neon/shortidct4x4llm_neon.c
 vp8/common/arm/neon/sixtappredict_neon.c
 vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -63,6 +62,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -279,6 +279,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/arm64/vp8_rtcd.h b/config/arm64/vp8_rtcd.h
index 1f376294..5ab06f46 100644
--- a/config/arm64/vp8_rtcd.h
+++ b/config/arm64/vp8_rtcd.h
@@ -44,14 +44,6 @@ void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst,
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -105,9 +97,6 @@ void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_neon
diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm
index b6c1a52e..d7d6652e 100644
--- a/config/arm64/vpx_config.asm
+++ b/config/arm64/vpx_config.asm
@@ -28,7 +28,7 @@
 .equ HAVE_UNISTD_H ,  1
 .equ CONFIG_DEPENDENCY_TRACKING ,  1
 .equ CONFIG_EXTERNAL_BUILD ,  1
-.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
 .equ CONFIG_INSTALL_BINS ,  1
 .equ CONFIG_INSTALL_LIBS ,  1
 .equ CONFIG_INSTALL_SRCS ,  0
@@ -86,4 +86,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_MISC_FIXES ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h
index fb0eabc0..981aa3e9 100644
--- a/config/arm64/vpx_config.h
+++ b/config/arm64/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h
index 2cac9e66..e5fa148c 100644
--- a/config/arm64/vpx_dsp_rtcd.h
+++ b/config/arm64/vpx_dsp_rtcd.h
@@ -103,6 +103,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
@@ -118,6 +130,18 @@ void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -130,6 +154,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
@@ -254,6 +293,9 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -728,6 +770,9 @@ uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int sou
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/generic/libvpx_srcs.txt b/config/generic/libvpx_srcs.txt
index f6e76f01..212026b4 100644
--- a/config/generic/libvpx_srcs.txt
+++ b/config/generic/libvpx_srcs.txt
@@ -44,6 +44,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -248,6 +249,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/generic/vp8_rtcd.h b/config/generic/vp8_rtcd.h
index f5424bbc..bad72a3e 100644
--- a/config/generic/vp8_rtcd.h
+++ b/config/generic/vp8_rtcd.h
@@ -41,12 +41,6 @@ void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -89,9 +83,6 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_c
 
diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm
index b684cd24..c3530a2b 100644
--- a/config/generic/vpx_config.asm
+++ b/config/generic/vpx_config.asm
@@ -28,7 +28,7 @@
 .equ HAVE_UNISTD_H ,  1
 .equ CONFIG_DEPENDENCY_TRACKING ,  1
 .equ CONFIG_EXTERNAL_BUILD ,  1
-.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
 .equ CONFIG_INSTALL_BINS ,  1
 .equ CONFIG_INSTALL_LIBS ,  1
 .equ CONFIG_INSTALL_SRCS ,  0
@@ -86,4 +86,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_MISC_FIXES ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h
index 9cdca1fd..50da7042 100644
--- a/config/generic/vpx_config.h
+++ b/config/generic/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h
index 010cbe78..f4929eec 100644
--- a/config/generic/vpx_dsp_rtcd.h
+++ b/config/generic/vpx_dsp_rtcd.h
@@ -94,6 +94,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
 
@@ -106,6 +118,18 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -118,6 +142,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
 
@@ -217,6 +256,9 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -643,6 +685,9 @@ uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int sou
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/mips32-dspr2/libvpx_srcs.txt b/config/mips32-dspr2/libvpx_srcs.txt
index 9ea5edef..452c0a38 100644
--- a/config/mips32-dspr2/libvpx_srcs.txt
+++ b/config/mips32-dspr2/libvpx_srcs.txt
@@ -50,6 +50,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -257,6 +258,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/mips32-dspr2/vp8_rtcd.h b/config/mips32-dspr2/vp8_rtcd.h
index 4442f6ae..03d3f0c8 100644
--- a/config/mips32-dspr2/vp8_rtcd.h
+++ b/config/mips32-dspr2/vp8_rtcd.h
@@ -41,12 +41,6 @@ void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -96,9 +90,6 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_dspr2
diff --git a/config/mips32-dspr2/vpx_config.h b/config/mips32-dspr2/vpx_config.h
index f0a0556e..4e8961cf 100644
--- a/config/mips32-dspr2/vpx_config.h
+++ b/config/mips32-dspr2/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/mips32-dspr2/vpx_dsp_rtcd.h b/config/mips32-dspr2/vpx_dsp_rtcd.h
index b716181f..7acb8072 100644
--- a/config/mips32-dspr2/vpx_dsp_rtcd.h
+++ b/config/mips32-dspr2/vpx_dsp_rtcd.h
@@ -102,6 +102,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
 
@@ -114,6 +126,18 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -126,6 +150,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
 
@@ -231,6 +270,9 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_dspr2
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 void vpx_idct16x16_10_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_dspr2
@@ -681,6 +723,9 @@ uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int sou
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips32-dspr2/vpx_version.h b/config/mips32-dspr2/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/mips32-dspr2/vpx_version.h
+++ b/config/mips32-dspr2/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/mips32/libvpx_srcs.txt b/config/mips32/libvpx_srcs.txt
index f6e76f01..212026b4 100644
--- a/config/mips32/libvpx_srcs.txt
+++ b/config/mips32/libvpx_srcs.txt
@@ -44,6 +44,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -248,6 +249,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/mips32/vp8_rtcd.h b/config/mips32/vp8_rtcd.h
index 28e23b31..791c1552 100644
--- a/config/mips32/vp8_rtcd.h
+++ b/config/mips32/vp8_rtcd.h
@@ -41,12 +41,6 @@ void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -89,9 +83,6 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_c
 
diff --git a/config/mips32/vpx_config.h b/config/mips32/vpx_config.h
index 1bc7afa7..82c9cf52 100644
--- a/config/mips32/vpx_config.h
+++ b/config/mips32/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/mips32/vpx_dsp_rtcd.h b/config/mips32/vpx_dsp_rtcd.h
index cff36af5..2d2bec21 100644
--- a/config/mips32/vpx_dsp_rtcd.h
+++ b/config/mips32/vpx_dsp_rtcd.h
@@ -94,6 +94,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
 
@@ -106,6 +118,18 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -118,6 +142,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
 
@@ -217,6 +256,9 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -643,6 +685,9 @@ uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int sou
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips32/vpx_version.h b/config/mips32/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/mips32/vpx_version.h
+++ b/config/mips32/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/mips64/libvpx_srcs.txt b/config/mips64/libvpx_srcs.txt
index f6e76f01..212026b4 100644
--- a/config/mips64/libvpx_srcs.txt
+++ b/config/mips64/libvpx_srcs.txt
@@ -44,6 +44,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -248,6 +249,7 @@ vp9/encoder/vp9_treewriter.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
diff --git a/config/mips64/vp8_rtcd.h b/config/mips64/vp8_rtcd.h
index 28e23b31..791c1552 100644
--- a/config/mips64/vp8_rtcd.h
+++ b/config/mips64/vp8_rtcd.h
@@ -41,12 +41,6 @@ void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c
 
@@ -89,9 +83,6 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sad_c
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_c
 
diff --git a/config/mips64/vpx_config.h b/config/mips64/vpx_config.h
index f19731bc..b6cc04bc 100644
--- a/config/mips64/vpx_config.h
+++ b/config/mips64/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/mips64/vpx_dsp_rtcd.h b/config/mips64/vpx_dsp_rtcd.h
index cff36af5..2d2bec21 100644
--- a/config/mips64/vpx_dsp_rtcd.h
+++ b/config/mips64/vpx_dsp_rtcd.h
@@ -94,6 +94,18 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
 
@@ -106,6 +118,18 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
@@ -118,6 +142,21 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
 
@@ -217,6 +256,9 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -643,6 +685,9 @@ uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int sou
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 #define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
 
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
 void vpx_dsp_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips64/vpx_version.h b/config/mips64/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/mips64/vpx_version.h
+++ b/config/mips64/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/x86/libvpx_srcs.txt b/config/x86/libvpx_srcs.txt
index af63fd8b..88150165 100644
--- a/config/x86/libvpx_srcs.txt
+++ b/config/x86/libvpx_srcs.txt
@@ -47,6 +47,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -79,7 +80,6 @@ vp8/common/x86/postproc_mmx.asm
 vp8/common/x86/postproc_sse2.asm
 vp8/common/x86/recon_mmx.asm
 vp8/common/x86/recon_sse2.asm
-vp8/common/x86/recon_wrapper_sse2.c
 vp8/common/x86/subpixel_mmx.asm
 vp8/common/x86/subpixel_sse2.asm
 vp8/common/x86/subpixel_ssse3.asm
@@ -293,6 +293,7 @@ vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
@@ -349,6 +350,8 @@ vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
 vpx_dsp/x86/fwd_txfm_impl_sse2.h
 vpx_dsp/x86/fwd_txfm_sse2.c
 vpx_dsp/x86/fwd_txfm_sse2.h
+vpx_dsp/x86/halfpix_variance_impl_sse2.asm
+vpx_dsp/x86/halfpix_variance_sse2.c
 vpx_dsp/x86/intrapred_sse2.asm
 vpx_dsp/x86/intrapred_ssse3.asm
 vpx_dsp/x86/inv_txfm_sse2.c
diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h
index fc714f41..c4c70452 100644
--- a/config/x86/vp8_rtcd.h
+++ b/config/x86/vp8_rtcd.h
@@ -60,16 +60,6 @@ int vp8_block_error_mmx(short *coeff, short *dqcoeff);
 int vp8_block_error_xmm(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_xmm
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
-
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 #define vp8_clear_system_state vpx_reset_mmx_state
@@ -146,9 +136,6 @@ int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *
 int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sadx3
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm
index 2b7f1ccf..b6557bb8 100644
--- a/config/x86/vpx_config.asm
+++ b/config/x86/vpx_config.asm
@@ -25,7 +25,7 @@
 %define HAVE_UNISTD_H 1
 %define CONFIG_DEPENDENCY_TRACKING 1
 %define CONFIG_EXTERNAL_BUILD 1
-%define CONFIG_INSTALL_DOCS 1
+%define CONFIG_INSTALL_DOCS 0
 %define CONFIG_INSTALL_BINS 1
 %define CONFIG_INSTALL_LIBS 1
 %define CONFIG_INSTALL_SRCS 0
@@ -83,3 +83,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
+%define CONFIG_MISC_FIXES 0
diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h
index 634c67ba..a5168579 100644
--- a/config/x86/vpx_config.h
+++ b/config/x86/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index 64ee53fa..af7917a2 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -116,6 +116,18 @@ void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_ssse3
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_ssse3
@@ -132,6 +144,18 @@ void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_ssse3
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_ssse3
@@ -148,6 +172,21 @@ void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_ssse3
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
@@ -281,6 +320,9 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_ssse3
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
@@ -864,15 +906,21 @@ unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, con
 
 uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 uint32_t vpx_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
-#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_mmx
+uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_sse2
 
 uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
-#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_mmx
+uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_sse2
 
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 uint32_t vpx_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
-#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_mmx
+uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_sse2
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 void vpx_dsp_rtcd(void);
 
diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/x86/vpx_version.h
+++ b/config/x86/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/config/x86_64/libvpx_srcs.txt b/config/x86_64/libvpx_srcs.txt
index ac3de522..3794e3ba 100644
--- a/config/x86_64/libvpx_srcs.txt
+++ b/config/x86_64/libvpx_srcs.txt
@@ -47,6 +47,7 @@ vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
 vp8/common/reconintra.c
+vp8/common/reconintra.h
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
 vp8/common/rtcd.c
@@ -80,7 +81,6 @@ vp8/common/x86/postproc_mmx.asm
 vp8/common/x86/postproc_sse2.asm
 vp8/common/x86/recon_mmx.asm
 vp8/common/x86/recon_sse2.asm
-vp8/common/x86/recon_wrapper_sse2.c
 vp8/common/x86/subpixel_mmx.asm
 vp8/common/x86/subpixel_sse2.asm
 vp8/common/x86/subpixel_ssse3.asm
@@ -296,6 +296,7 @@ vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
 vp9/vp9_dx_iface.c
+vp9/vp9_dx_iface.h
 vp9/vp9_iface_common.h
 vp9/vp9cx.mk
 vp9/vp9dx.mk
@@ -353,6 +354,8 @@ vpx_dsp/x86/fwd_txfm_impl_sse2.h
 vpx_dsp/x86/fwd_txfm_sse2.c
 vpx_dsp/x86/fwd_txfm_sse2.h
 vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+vpx_dsp/x86/halfpix_variance_impl_sse2.asm
+vpx_dsp/x86/halfpix_variance_sse2.c
 vpx_dsp/x86/intrapred_sse2.asm
 vpx_dsp/x86/intrapred_ssse3.asm
 vpx_dsp/x86/inv_txfm_sse2.c
diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h
index fc714f41..c4c70452 100644
--- a/config/x86_64/vp8_rtcd.h
+++ b/config/x86_64/vp8_rtcd.h
@@ -60,16 +60,6 @@ int vp8_block_error_mmx(short *coeff, short *dqcoeff);
 int vp8_block_error_xmm(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_xmm
 
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
-
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 #define vp8_clear_system_state vpx_reset_mmx_state
@@ -146,9 +136,6 @@ int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *
 int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_full_search_sad vp8_full_search_sadx3
 
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
 void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm
index 6f0800b4..774d73fb 100644
--- a/config/x86_64/vpx_config.asm
+++ b/config/x86_64/vpx_config.asm
@@ -25,7 +25,7 @@
 %define HAVE_UNISTD_H 1
 %define CONFIG_DEPENDENCY_TRACKING 1
 %define CONFIG_EXTERNAL_BUILD 1
-%define CONFIG_INSTALL_DOCS 1
+%define CONFIG_INSTALL_DOCS 0
 %define CONFIG_INSTALL_BINS 1
 %define CONFIG_INSTALL_LIBS 1
 %define CONFIG_INSTALL_SRCS 0
@@ -83,3 +83,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
+%define CONFIG_MISC_FIXES 0
diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h
index 8796347b..9278f1e8 100644
--- a/config/x86_64/vpx_config.h
+++ b/config/x86_64/vpx_config.h
@@ -37,7 +37,7 @@
 #define HAVE_UNISTD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
 #define CONFIG_EXTERNAL_BUILD 1
-#define CONFIG_INSTALL_DOCS 1
+#define CONFIG_INSTALL_DOCS 0
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
@@ -95,4 +95,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h
index e78d8ef6..73962338 100644
--- a/config/x86_64/vpx_dsp_rtcd.h
+++ b/config/x86_64/vpx_dsp_rtcd.h
@@ -116,6 +116,18 @@ void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_ssse3
 
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
 void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_ssse3
@@ -132,6 +144,18 @@ void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_ssse3
 
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
 void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_ssse3
@@ -148,6 +172,21 @@ void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
 void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_ssse3
 
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
 void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
@@ -282,6 +321,9 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_ssse3
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
@@ -870,15 +912,21 @@ unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, con
 
 uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 uint32_t vpx_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
-#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_mmx
+uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_sse2
 
 uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
-#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_mmx
+uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_sse2
 
 uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
 uint32_t vpx_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
-#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_mmx
+uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_sse2
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 void vpx_dsp_rtcd(void);
 
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h
index bce03815..3b6ea1e9 100644
--- a/config/x86_64/vpx_version.h
+++ b/config/x86_64/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  4
+#define VERSION_MINOR  5
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.4.0"
-#define VERSION_STRING      " v1.4.0"
+#define VERSION_STRING_NOSP "v1.5.0"
+#define VERSION_STRING      " v1.5.0"
diff --git a/libvpx/.mailmap b/libvpx/.mailmap
index 0bfda120..42f3617b 100644
--- a/libvpx/.mailmap
+++ b/libvpx/.mailmap
@@ -1,14 +1,21 @@
 Adrian Grange <agrange@google.com>
-Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
 Deb Mukherjee <debargha@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
+Hui Su <huisu@google.com>
+Jacky Chen <jackychen@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
 Marco Paniconi <marpan@google.com>
@@ -17,10 +24,13 @@ Pascal Massimino <pascal.massimino@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
 Sami Pietilä <samipietila@google.com>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
+Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS
index 2f63d7c5..f89b6776 100644
--- a/libvpx/AUTHORS
+++ b/libvpx/AUTHORS
@@ -5,9 +5,9 @@ Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
 Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
+Aℓex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -16,8 +16,10 @@ Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Andrew Russell <anrussell@google.com>
+Angie Chiang <angiebird@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
 chm <chm@rock-chips.com>
@@ -27,6 +29,7 @@ Deb Mukherjee <debargha@google.com>
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
+Ed Baker <edward.baker@intel.com>
 Ehsan Akhgari <ehsan.akhgari@gmail.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
@@ -34,6 +37,8 @@ Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+Geza Lore <gezalore@gmail.com>
+Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
 Guillaume Martres <gmartres@google.com>
@@ -44,7 +49,7 @@ Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
-JackyChen <jackychen@google.com>
+Jacky Chen <jackychen@google.com>
 James Berry <jamesberry@google.com>
 James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
@@ -60,9 +65,11 @@ Jingning Han <jingning@google.com>
 Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
+Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
+Julia Robson <juliamrobson@gmail.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 KO Myung-Hun <komh@chollian.net>
@@ -82,6 +89,7 @@ Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
 Minghai Shang <minghai@google.com>
 Morton Jonuschat <yabawock@gmail.com>
+Nico Weber <thakis@chromium.org>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Patrik Westin <patrik.westin@gmail.com>
@@ -96,7 +104,7 @@ Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
 Rob Bradford <rob@linux.intel.com>
-Ronald S. Bultje <rbultje@google.com>
+Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
 Scott Graham <scottmg@chromium.org>
@@ -104,6 +112,7 @@ Scott LaVarnway <slavarnway@google.com>
 Sean McGovern <gseanmcg@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
+Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index b0d30644..7746cc6c 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,7 +1,19 @@
-xxxx-yy-zz v1.4.0 "Changes for next release"
-  vpxenc is changed to use VP9 by default.
-  Encoder controls added for 1 pass SVC.
-  Decoder control to toggle on/off loopfilter.
+2015-11-09 v1.5.0 "Javan Whistling Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
+    controls and adds a variety of VP9 controls for testing.
+
+    The vpxenc utility now prefers VP9 by default.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Smaller library size by combining functions used by VP8 and VP9
+
+  - Bug Fixes:
+    A variety of fuzzing issues
 
 2015-04-03 v1.4.0 "Indian Runner Duck"
   This release includes significant improvements to the VP9 codec.
diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile
index f1b1cca3..3081a926 100644
--- a/libvpx/build/make/Makefile
+++ b/libvpx/build/make/Makefile
@@ -140,6 +140,8 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
 $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
 $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
+$(BUILD_PFX)%vp9_reconintra.c.d: CFLAGS += $(STACKREALIGN)
+$(BUILD_PFX)%vp9_reconintra.c.o: CFLAGS += $(STACKREALIGN)
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
@@ -285,7 +287,7 @@ define archive_template
 # for creating them.
 $(1):
 	$(if $(quiet),@echo "    [AR] $$@")
-	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$?
+	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
 endef
 
 define so_template
diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh
index 688fa12c..c592b638 100755
--- a/libvpx/build/make/configure.sh
+++ b/libvpx/build/make/configure.sh
@@ -73,6 +73,7 @@ Build options:
   --target=TARGET             target platform tuple [generic-gnu]
   --cpu=CPU                   optimize for a specific cpu rather than a family
   --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
+  --extra-cxxflags=ECXXFLAGS  add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
   ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
   ${toggle_werror}            treat warnings as errors, if possible
                               (not available with all compilers)
@@ -200,6 +201,10 @@ disabled(){
   eval test "x\$$1" = "xno"
 }
 
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) disabled, and enables the setting controlled by
+# the parameter when the setting is not disabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
 soft_enable() {
   for var in $*; do
     if ! disabled $var; then
@@ -209,6 +214,10 @@ soft_enable() {
   done
 }
 
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) enabled, and disables the setting controlled by
+# the parameter when the setting is not enabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
 soft_disable() {
   for var in $*; do
     if ! enabled $var; then
@@ -337,6 +346,10 @@ check_add_cflags() {
   check_cflags "$@" && add_cflags_only "$@"
 }
 
+check_add_cxxflags() {
+  check_cxxflags "$@" && add_cxxflags_only "$@"
+}
+
 check_add_asflags() {
   log add_asflags "$@"
   add_asflags "$@"
@@ -428,7 +441,7 @@ NM=${NM}
 
 CFLAGS  = ${CFLAGS}
 CXXFLAGS  = ${CXXFLAGS}
-ARFLAGS = -rus\$(if \$(quiet),c,v)
+ARFLAGS = -crs\$(if \$(quiet),,v)
 LDFLAGS = ${LDFLAGS}
 ASFLAGS = ${ASFLAGS}
 extralibs = ${extralibs}
@@ -503,6 +516,9 @@ process_common_cmdline() {
       --extra-cflags=*)
         extra_cflags="${optval}"
         ;;
+      --extra-cxxflags=*)
+        extra_cxxflags="${optval}"
+        ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
         if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
@@ -617,6 +633,11 @@ show_darwin_sdk_path() {
     xcodebuild -sdk $1 -version Path 2>/dev/null
 }
 
+# Print the major version number of the Darwin SDK specified by $1.
+show_darwin_sdk_major_version() {
+  xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -729,13 +750,14 @@ process_common_toolchain() {
   # platforms, so use the newest one available.
   case ${toolchain} in
     arm*-darwin*)
-      ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
-      if [ -d "${ios_sdk_dir}" ]; then
-        add_cflags  "-isysroot ${ios_sdk_dir}"
-        add_ldflags "-isysroot ${ios_sdk_dir}"
+      add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${iphoneos_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iphoneos_sdk_dir}"
+        add_ldflags "-isysroot ${iphoneos_sdk_dir}"
       fi
       ;;
-    *-darwin*)
+    x86*-darwin*)
       osx_sdk_dir="$(show_darwin_sdk_path macosx)"
       if [ -d "${osx_sdk_dir}" ]; then
         add_cflags  "-isysroot ${osx_sdk_dir}"
@@ -811,16 +833,35 @@ process_common_toolchain() {
             die "Disabling neon while keeping neon-asm is not supported"
           fi
           case ${toolchain} in
+            # Apple iOS SDKs no longer support armv6 as of the version 9
+            # release (coincides with release of Xcode 7). Only enable media
+            # when using earlier SDK releases.
             *-darwin*)
-              # Neon is guaranteed on iOS 6+ devices, while old media extensions
-              # no longer assemble with iOS 9 SDK
+              if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
+                soft_enable media
+              else
+                soft_disable media
+                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media "
+              fi
               ;;
             *)
               soft_enable media
+              ;;
           esac
           ;;
         armv6)
-          soft_enable media
+          case ${toolchain} in
+            *-darwin*)
+              if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
+                soft_enable media
+              else
+                die "Your iOS SDK does not support armv6."
+              fi
+              ;;
+            *)
+              soft_enable media
+              ;;
+          esac
           ;;
       esac
 
@@ -1003,6 +1044,12 @@ EOF
           done
 
           asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+
+          if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
+            check_add_cflags -fembed-bitcode
+            check_add_asflags -fembed-bitcode
+            check_add_ldflags -fembed-bitcode
+          fi
           ;;
 
         linux*)
@@ -1081,7 +1128,9 @@ EOF
           CROSS=${CROSS:-g}
           ;;
         os2)
+          disable_feature pic
           AS=${AS:-nasm}
+          add_ldflags -Zhigh-mem
           ;;
       esac
 
@@ -1171,7 +1220,8 @@ EOF
               && AS=""
           fi
           [ "${AS}" = auto ] || [ -z "${AS}" ] \
-            && die "Neither yasm nor nasm have been found"
+            && die "Neither yasm nor nasm have been found." \
+                   "See the prerequisites section in the README for more info."
           ;;
       esac
       log_echo "  using $AS"
@@ -1210,6 +1260,13 @@ EOF
           enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
           add_cflags  ${sim_arch}
           add_ldflags ${sim_arch}
+
+          if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
+            # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it
+            # on is pointless (unless building a C-only lib). Warn the user, but
+            # do nothing here.
+            log "Warning: Bitcode embed disabled for simulator targets."
+          fi
           ;;
         os2)
           add_asflags -f aout
@@ -1323,12 +1380,6 @@ EOF
     add_cflags -D_LARGEFILE_SOURCE
     add_cflags -D_FILE_OFFSET_BITS=64
   fi
-
-  # append any user defined extra cflags
-  if [ -n "${extra_cflags}" ] ; then
-    check_add_cflags ${extra_cflags} || \
-    die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
-  fi
 }
 
 process_toolchain() {
diff --git a/libvpx/build/make/iosbuild.sh b/libvpx/build/make/iosbuild.sh
index 89fa6818..6f7180d0 100755
--- a/libvpx/build/make/iosbuild.sh
+++ b/libvpx/build/make/iosbuild.sh
@@ -25,7 +25,6 @@ CONFIGURE_ARGS="--disable-docs
 DIST_DIR="_dist"
 FRAMEWORK_DIR="VPX.framework"
 HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
-MAKE_JOBS=1
 SCRIPT_DIR=$(dirname "$0")
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
@@ -41,15 +40,24 @@ TARGETS="arm64-darwin-gcc
 build_target() {
   local target="$1"
   local old_pwd="$(pwd)"
+  local target_specific_flags=""
 
   vlog "***Building target: ${target}***"
 
+  case "${target}" in
+    x86-*)
+      target_specific_flags="--enable-pic"
+      vlog "Enabled PIC for ${target}"
+      ;;
+  esac
+
   mkdir "${target}"
   cd "${target}"
   eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
-    ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${devnull}
+    ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \
+    ${devnull}
   export DIST_DIR
-  eval make -j ${MAKE_JOBS} dist ${devnull}
+  eval make dist ${devnull}
   cd "${old_pwd}"
 
   vlog "***Done building target: ${target}***"
@@ -194,11 +202,12 @@ cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
-    --jobs: Number of make jobs.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
     --targets <targets>: Override default target list. Defaults:
          ${TARGETS}
+    --test-link: Confirms all targets can be linked. Functionally identical to
+                 passing --enable-examples via --extra-configure-args.
     --verbose: Output information about the environment and each stage of the
                build.
 EOF
@@ -227,16 +236,15 @@ while [ -n "$1" ]; do
       iosbuild_usage
       exit
       ;;
-    --jobs)
-      MAKE_JOBS="$2"
-      shift
-      ;;
     --preserve-build-output)
       PRESERVE_BUILD_OUTPUT=yes
       ;;
     --show-build-output)
       devnull=
       ;;
+    --test-link)
+      EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples"
+      ;;
     --targets)
       TARGETS="$2"
       shift
@@ -260,11 +268,11 @@ cat << EOF
   EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
   FRAMEWORK_DIR=${FRAMEWORK_DIR}
   HEADER_DIR=${HEADER_DIR}
-  MAKE_JOBS=${MAKE_JOBS}
-  PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
   LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
   LIPO=${LIPO}
+  MAKEFLAGS=${MAKEFLAGS}
   ORIG_PWD=${ORIG_PWD}
+  PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
   TARGETS="${TARGETS}"
 EOF
 fi
diff --git a/libvpx/configure b/libvpx/configure
index ac196dac..a40f3abb 100755
--- a/libvpx/configure
+++ b/libvpx/configure
@@ -264,6 +264,7 @@ EXPERIMENT_LIST="
     spatial_svc
     fp_mb_stats
     emulate_hardware
+    misc_fixes
 "
 CONFIG_LIST="
     dependency_tracking
@@ -716,6 +717,16 @@ EOF
     esac
     # libwebm needs to be linked with C++ standard library
     enabled webm_io && LD=${CXX}
+
+    # append any user defined extra cflags
+    if [ -n "${extra_cflags}" ] ; then
+        check_add_cflags ${extra_cflags} || \
+        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+    fi
+    if [ -n "${extra_cxxflags}" ]; then
+        check_add_cxxflags ${extra_cxxflags} || \
+        die "Requested extra CXXFLAGS '${extra_cxxflags}' not supported by compiler"
+    fi
 }
 
 
diff --git a/libvpx/examples.mk b/libvpx/examples.mk
index dfa5a654..f10bec68 100644
--- a/libvpx/examples.mk
+++ b/libvpx/examples.mk
@@ -36,6 +36,8 @@ LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
                 third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_win.cc \
 
+LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
+
 LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
                       third_party/libwebm/mkvmuxerutil.cpp \
                       third_party/libwebm/mkvwriter.cpp \
@@ -43,8 +45,7 @@ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
                       third_party/libwebm/mkvmuxertypes.hpp \
                       third_party/libwebm/mkvmuxerutil.hpp \
                       third_party/libwebm/mkvparser.hpp \
-                      third_party/libwebm/mkvwriter.hpp \
-                      third_party/libwebm/webmids.hpp
+                      third_party/libwebm/mkvwriter.hpp
 
 LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
                       third_party/libwebm/mkvreader.cpp \
@@ -68,6 +69,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
   vpxdec.SRCS                 += $(LIBYUV_SRCS)
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
+  vpxdec.SRCS                 += $(LIBWEBM_COMMON_SRCS)
   vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS                 += webmdec.cc webmdec.h
 endif
@@ -89,6 +91,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
   vpxenc.SRCS                 += $(LIBYUV_SRCS)
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
+  vpxenc.SRCS                 += $(LIBWEBM_COMMON_SRCS)
   vpxenc.SRCS                 += $(LIBWEBM_MUXER_SRCS)
   vpxenc.SRCS                 += webmenc.cc webmenc.h
 endif
diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c
index 5a609766..b26e9873 100644
--- a/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -25,6 +25,7 @@
 #include "../tools_common.h"
 #include "../video_writer.h"
 
+#include "../vpx_ports/vpx_timer.h"
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
@@ -79,6 +80,8 @@ static const arg_def_t rc_end_usage_arg =
     ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
 static const arg_def_t speed_arg =
     ARG_DEF("sp", "speed", 1, "speed configuration");
+static const arg_def_t aqmode_arg =
+    ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -100,7 +103,7 @@ static const arg_def_t *svc_args[] = {
   &kf_dist_arg,       &scale_factors_arg, &passes_arg,      &pass_arg,
   &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
   &max_bitrate_arg,   &temporal_layers_arg, &temporal_layering_mode_arg,
-  &lag_in_frame_arg,  &threads_arg,
+  &lag_in_frame_arg,  &threads_arg,       &aqmode_arg,
 #if OUTPUT_RC_STATS
   &output_rc_stats_arg,
 #endif
@@ -220,6 +223,8 @@ static void parse_command_line(int argc, const char **argv_,
 #endif
     } else if (arg_match(&arg, &speed_arg, argi)) {
       svc_ctx->speed = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &aqmode_arg, argi)) {
+      svc_ctx->aqmode = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &threads_arg, argi)) {
       svc_ctx->threads = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) {
@@ -539,6 +544,59 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data,
 }
 #endif
 
+// Fills |ref_frame_config| (frame flags plus the lst/gld/alt buffer indices)
+// for each spatial layer of one superframe in bypass/flexible mode. This is
+// an example pattern for 2 temporal layers; it corresponds to the pattern
+// VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+// non-flexible mode.
+//   sl: value on entry is ignored; reused as the spatial layer loop counter.
+//   tl: temporal layer id. Only 0 and 1 are handled; for any other value
+//       |ref_frame_config| is left untouched.
+//   num_spatial_layers: number of leading entries written in each array.
+//   is_key_frame: only consulted for spatial layers above 0 when tl is 0.
+//   ref_frame_config: output. Its frame_flags, lst_fb_idx, gld_fb_idx and
+//       alt_fb_idx arrays are indexed by spatial layer.
+void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
+                                 int is_key_frame,
+                                 vpx_svc_ref_frame_config_t *ref_frame_config) {
+  for (sl = 0; sl < num_spatial_layers; ++sl) {
+    const int is_lowest_sl = (sl == 0);
+
+    if (tl == 0) {
+      // Flags common to every spatial layer of temporal layer 0.
+      ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF |
+                                          VP8_EFLAG_NO_UPD_GF |
+                                          VP8_EFLAG_NO_UPD_ARF;
+      // One extra flag depends on the spatial layer and key-frame state.
+      if (is_lowest_sl) {
+        ref_frame_config->frame_flags[sl] |= VP8_EFLAG_NO_REF_GF;
+      } else if (is_key_frame) {
+        // On a key frame the upper spatial layers also drop the LAST ref.
+        ref_frame_config->frame_flags[sl] |= VP8_EFLAG_NO_REF_LAST;
+      }
+
+      ref_frame_config->lst_fb_idx[sl] = sl;
+      // gld reuses the lst index of the spatial layer below (0 for layer 0).
+      ref_frame_config->gld_fb_idx[sl] = is_lowest_sl ? 0 : sl - 1;
+      ref_frame_config->alt_fb_idx[sl] = 0;
+    } else if (tl == 1) {
+      // Flags common to every spatial layer of temporal layer 1.
+      ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF |
+                                          VP8_EFLAG_NO_UPD_LAST |
+                                          VP8_EFLAG_NO_UPD_GF;
+      if (is_lowest_sl) {
+        // As for temporal layer 0, only layer 0 adds NO_REF_GF.
+        ref_frame_config->frame_flags[sl] |= VP8_EFLAG_NO_REF_GF;
+      }
+
+      ref_frame_config->lst_fb_idx[sl] = sl;
+      // Indices start at num_spatial_layers - 1; alt is always gld + 1.
+      ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
+      ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+    }
+  }
+}
+
 int main(int argc, const char **argv) {
   AppInput app_input = {0};
   VpxVideoWriter *writer = NULL;
@@ -559,11 +617,14 @@ int main(int argc, const char **argv) {
   VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
   struct RateControlStats rc;
   vpx_svc_layer_id_t layer_id;
+  vpx_svc_ref_frame_config_t ref_frame_config;
   int sl, tl;
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate  = 30.0;
 #endif
+  struct vpx_usec_timer timer;
+  int64_t cx_time = 0;
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
   exec_name = argv[0];
@@ -632,6 +693,9 @@ int main(int argc, const char **argv) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
   if (svc_ctx.threads)
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
+  if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
+    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+
 
   // Encode frames
   while (!end_of_stream) {
@@ -643,9 +707,36 @@ int main(int argc, const char **argv) {
       end_of_stream = 1;
     }
 
+    // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates)
+    // and the buffer indices for each spatial layer of the current
+    // (super)frame to be encoded. The temporal layer_id for the current frame
+    // also needs to be set.
+    // TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS"
+    // mode to "VP9E_LAYERING_MODE_BYPASS".
+    if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      // Example for 2 temporal layers.
+      if (frame_cnt % 2 == 0)
+        layer_id.temporal_layer_id = 0;
+      else
+        layer_id.temporal_layer_id = 1;
+      // Note that we only set the temporal layer_id, since we are calling
+      // the encode for the whole superframe. The encoder will internally loop
+      // over all the spatial layers for the current superframe.
+      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(0 /* sl is unset here; callee ignores it */,
+                                  layer_id.temporal_layer_id,
+                                  svc_ctx.spatial_layers, frame_cnt == 0,
+                                  &ref_frame_config);
+      vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG,
+                        &ref_frame_config);
+    }
+
+    vpx_usec_timer_start(&timer);
     res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
                          pts, frame_duration, svc_ctx.speed >= 5 ?
                          VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY);
+    vpx_usec_timer_mark(&timer);
+    cx_time += vpx_usec_timer_elapsed(&timer);
 
     printf("%s", vpx_svc_get_message(&svc_ctx));
     if (res != VPX_CODEC_OK) {
@@ -784,6 +875,10 @@ int main(int argc, const char **argv) {
     }
   }
 #endif
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+         frame_cnt,  // Scale in double: integer frame_cnt * 1000000 can wrap.
+         1000 * (double)cx_time / ((double)frame_cnt * 1000000),
+         1000000 * (double)frame_cnt / (double)cx_time);
   vpx_img_free(&raw);
   // display average size, psnr
   printf("%s", vpx_svc_dump_statistics(&svc_ctx));
diff --git a/libvpx/examples/vpx_temporal_svc_encoder.c b/libvpx/examples/vpx_temporal_svc_encoder.c
index ee7de6b7..5adda9ee 100644
--- a/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -684,14 +684,14 @@ int main(int argc, char **argv) {
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
     vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
-    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
     vpx_svc_extra_cfg_t svc_params;
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
     vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
     vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
-    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
     vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
     if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0))
diff --git a/libvpx/libs.mk b/libvpx/libs.mk
index b9d4b286..f28d84a5 100644
--- a/libvpx/libs.mk
+++ b/libvpx/libs.mk
@@ -53,7 +53,7 @@ CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
 include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk
 CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS))
 
-ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
+ifeq ($(CONFIG_VP8),yes)
   VP8_PREFIX=vp8/
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
 endif
@@ -76,7 +76,7 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
   CODEC_DOC_SECTIONS += vp8 vp8_decoder
 endif
 
-ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+ifeq ($(CONFIG_VP9),yes)
   VP9_PREFIX=vp9/
   include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
 endif
@@ -110,7 +110,7 @@ VP9_PREFIX=vp9/
 $(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
 
 #  VP10 make file
-ifneq ($(CONFIG_VP10_ENCODER)$(CONFIG_VP10_DECODER),)
+ifeq ($(CONFIG_VP10),yes)
   VP10_PREFIX=vp10/
   include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10_common.mk
 endif
@@ -260,7 +260,7 @@ OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 2
+SO_VERSION_MAJOR := 3
 SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
diff --git a/libvpx/test/active_map_refresh_test.cc b/libvpx/test/active_map_refresh_test.cc
new file mode 100644
index 00000000..c9456614
--- /dev/null
+++ b/libvpx/test/active_map_refresh_test.cc
@@ -0,0 +1,127 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <algorithm>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+// Returns 1 if any sample of the 16x16 macroblock at (mb_r, mb_c), in any of
+// the three planes, differs between |current| and |previous|; otherwise 0.
+int CheckMb(const vpx_image_t &current, const vpx_image_t &previous,
+            int mb_r, int mb_c) {
+  for (int plane = 0; plane < 3; ++plane) {
+    const bool is_chroma = plane > 0;
+    int row_begin = std::max(16 * mb_r, 0);
+    int col_begin = std::max(16 * mb_c, 0);
+    int row_end = std::min(16 * mb_r + 16, static_cast<int>(current.d_h));
+    int col_end = std::min(16 * mb_c + 16, static_cast<int>(current.d_w));
+    if (is_chroma && current.x_chroma_shift) {
+      col_begin >>= 1;
+      col_end = (col_end + 1) >> 1;  // Halve, rounding up.
+    }
+    if (is_chroma && current.y_chroma_shift) {
+      row_begin >>= 1;
+      row_end = (row_end + 1) >> 1;
+    }
+    for (int row = row_begin; row < row_end; ++row) {
+      for (int col = col_begin; col < col_end; ++col) {
+        if (current.planes[plane][current.stride[plane] * row + col] !=
+            previous.planes[plane][previous.stride[plane] * row + col])
+          return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+// Writes CheckMb()'s result for every macroblock into |map|, in raster order.
+void GenerateMap(int mb_rows, int mb_cols, const vpx_image_t &current,
+                 const vpx_image_t &previous, uint8_t *map) {
+  uint8_t *out = map;
+  for (int row = 0; row < mb_rows; ++row)
+    for (int col = 0; col < mb_cols; ++col)
+      *out++ = CheckMb(current, previous, row, col);
+}
+
+const int kAqModeCyclicRefresh = 3;
+
+class ActiveMapRefreshTest  // Encodes while uploading a per-frame active map.
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ActiveMapRefreshTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    cpu_used_ = GET_PARAM(2);  // Applied via VP8E_SET_CPUUSED on frame 1.
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    ::libvpx_test::Y4mVideoSource *y4m_video =
+        static_cast<libvpx_test::Y4mVideoSource *>(video);
+    if (video->frame() == 1) {  // Encoder controls are set at frame 1.
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh);
+    } else if (video->frame() >= 2 && video->img()) {
+      vpx_image_t *current = video->img();
+      vpx_image_t *previous = y4m_holder_->img();
+      ASSERT_TRUE(previous != NULL);  // On failure, returns from this hook.
+      vpx_active_map_t map = vpx_active_map_t();
+      const int width = static_cast<int>(current->d_w);
+      const int height = static_cast<int>(current->d_h);
+      const int mb_width = (width + 15) / 16;  // Round up to whole 16-px units.
+      const int mb_height = (height + 15) / 16;
+      uint8_t *active_map = new uint8_t[mb_width * mb_height];
+      GenerateMap(mb_height, mb_width, *current, *previous, active_map);
+      map.cols = mb_width;
+      map.rows = mb_height;
+      map.active_map = active_map;
+      encoder->Control(VP8E_SET_ACTIVEMAP, &map);
+      delete[] active_map;  // Assumes Control() copied the map - TODO confirm.
+    }
+    if (video->img()) {
+      y4m_video->SwapBuffers(y4m_holder_);  // Presumably keeps this frame.
+    }
+  }
+
+  int cpu_used_;  // Test parameter 2.
+  ::libvpx_test::Y4mVideoSource *y4m_holder_;  // Supplies |previous|.
+};
+
+TEST_P(ActiveMapRefreshTest, Test) {  // One-pass CBR encode with zero lag.
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_profile = 1;
+  cfg_.rc_target_bitrate = 600;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.rc_min_quantizer = 8;
+  cfg_.rc_max_quantizer = 30;
+  cfg_.g_pass = VPX_RC_ONE_PASS;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.kf_max_dist = 90000;
+
+  ::libvpx_test::Y4mVideoSource video("desktop_credits.y4m", 0, 30);
+  ::libvpx_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, 30);
+  video_holder.Begin();
+  y4m_holder_ = &video_holder;  // Second reader of the same clip.
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 6));  // End exclusive: 5 only.
+}  // namespace
diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc
index e0e929e6..08267882 100644
--- a/libvpx/test/convolve_test.cc
+++ b/libvpx/test/convolve_test.cc
@@ -960,511 +960,72 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_SSE2 && ARCH_X86_64
-void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,
-                                 int filter_x_stride,
-                                 const int16_t *filter_y,
-                                 int filter_y_stride,
-                                 int w, int h) {
-  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-                                  filter_x_stride, filter_y, filter_y_stride,
-                                  w, h, 8);
-}
-
-void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x,
-                                     int filter_x_stride,
-                                     const int16_t *filter_y,
-                                     int filter_y_stride,
-                                     int w, int h) {
-  vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
-                                      filter_x, filter_x_stride,
-                                      filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x,
-                                int filter_x_stride,
-                                const int16_t *filter_y,
-                                int filter_y_stride,
-                                int w, int h) {
-  vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
-                                 filter_x, filter_x_stride,
-                                 filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x,
-                                    int filter_x_stride,
-                                    const int16_t *filter_y,
-                                    int filter_y_stride,
-                                    int w, int h) {
-  vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
-                                     filter_x, filter_x_stride,
-                                     filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x,
-                           int filter_x_stride,
-                           const int16_t *filter_y,
-                           int filter_y_stride,
-                           int w, int h) {
-  vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
-                            filter_x, filter_x_stride,
-                            filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x,
-                               int filter_x_stride,
-                               const int16_t *filter_y,
-                               int filter_y_stride,
-                               int w, int h) {
-  vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
-                                filter_x, filter_x_stride,
-                                filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x,
-                                  int filter_x_stride,
-                                  const int16_t *filter_y,
-                                  int filter_y_stride,
-                                  int w, int h) {
-  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
-                                  filter_x, filter_x_stride,
-                                  filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x,
-                                      int filter_x_stride,
-                                      const int16_t *filter_y,
-                                      int filter_y_stride,
-                                      int w, int h) {
-  vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
-                                      filter_x, filter_x_stride,
-                                      filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,
-                                 int filter_x_stride,
-                                 const int16_t *filter_y,
-                                 int filter_y_stride,
-                                 int w, int h) {
-  vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
-                                 filter_x, filter_x_stride,
-                                 filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x,
-                                     int filter_x_stride,
-                                     const int16_t *filter_y,
-                                     int filter_y_stride,
-                                     int w, int h) {
-  vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
-                                     filter_x, filter_x_stride,
-                                     filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x,
-                            int filter_x_stride,
-                            const int16_t *filter_y,
-                            int filter_y_stride,
-                            int w, int h) {
-  vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
-                            filter_x, filter_x_stride,
-                            filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x,
-                                int filter_x_stride,
-                                const int16_t *filter_y,
-                                int filter_y_stride,
-                                int w, int h) {
-  vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
-                                filter_x, filter_x_stride,
-                                filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x,
-                                  int filter_x_stride,
-                                  const int16_t *filter_y,
-                                  int filter_y_stride,
-                                  int w, int h) {
-  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
-                                  filter_x, filter_x_stride,
-                                  filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x,
-                                      int filter_x_stride,
-                                      const int16_t *filter_y,
-                                      int filter_y_stride,
-                                      int w, int h) {
-  vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
-                                      filter_x, filter_x_stride,
-                                      filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,
-                                 int filter_x_stride,
-                                 const int16_t *filter_y,
-                                 int filter_y_stride,
-                                 int w, int h) {
-  vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
-                                 filter_x, filter_x_stride,
-                                 filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x,
-                                     int filter_x_stride,
-                                     const int16_t *filter_y,
-                                     int filter_y_stride,
-                                     int w, int h) {
-  vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
-                                     filter_x, filter_x_stride,
-                                     filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x,
-                            int filter_x_stride,
-                            const int16_t *filter_y,
-                            int filter_y_stride,
-                            int w, int h) {
-  vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
-                            filter_x, filter_x_stride,
-                            filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x,
-                                int filter_x_stride,
-                                const int16_t *filter_y,
-                                int filter_y_stride,
-                                int w, int h) {
-  vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
-                                filter_x, filter_x_stride,
-                                filter_y, filter_y_stride, w, h, 12);
+#define WRAP(func, bd) /* Emits wrap_<func>_<bd>(); bd is baked in. */ \
+void wrap_ ## func ## _ ## bd(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, \
+                              int filter_x_stride, \
+                              const int16_t *filter_y, \
+                              int filter_y_stride, \
+                              int w, int h) { \
+  vpx_highbd_ ## func(src, src_stride, dst, dst_stride, filter_x, \
+                      filter_x_stride, filter_y, filter_y_stride, \
+                      w, h, bd); /* bd is the extra trailing argument */ \
 }
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_USE_X86INC  // SSE2 copy/avg wrappers exist only in x86inc builds.
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_avg_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_avg_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve_avg_sse2, 12)
+#endif  // CONFIG_USE_X86INC
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_avg_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_avg_vert_sse2, 8)
+WRAP(convolve8_sse2, 8)
+WRAP(convolve8_avg_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_avg_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_avg_vert_sse2, 10)
+WRAP(convolve8_sse2, 10)
+WRAP(convolve8_avg_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_avg_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+WRAP(convolve8_avg_vert_sse2, 12)
+WRAP(convolve8_sse2, 12)
+WRAP(convolve8_avg_sse2, 12)
 #endif  // HAVE_SSE2 && ARCH_X86_64
 
-void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x,
-                            int filter_x_stride,
-                            const int16_t *filter_y,
-                            int filter_y_stride,
-                            int w, int h) {
-  vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
-                             filter_x, filter_x_stride,
-                             filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x,
-                           int filter_x_stride,
-                           const int16_t *filter_y,
-                           int filter_y_stride,
-                           int w, int h) {
-  vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
-                            filter_x, filter_x_stride,
-                            filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x,
-                              int filter_x_stride,
-                              const int16_t *filter_y,
-                              int filter_y_stride,
-                              int w, int h) {
-  vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                               filter_x, filter_x_stride,
-                               filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x,
-                                  int filter_x_stride,
-                                  const int16_t *filter_y,
-                                  int filter_y_stride,
-                                  int w, int h) {
-  vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                                   filter_x, filter_x_stride,
-                                   filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,
-                             int filter_x_stride,
-                             const int16_t *filter_y,
-                             int filter_y_stride,
-                             int w, int h) {
-  vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                              filter_x, filter_x_stride,
-                              filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,
-                                 int filter_x_stride,
-                                 const int16_t *filter_y,
-                                 int filter_y_stride,
-                                 int w, int h) {
-  vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                                  filter_x, filter_x_stride,
-                                  filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x,
-                        int filter_x_stride,
-                        const int16_t *filter_y,
-                        int filter_y_stride,
-                        int w, int h) {
-  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
-                         filter_x, filter_x_stride,
-                         filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x,
-                            int filter_x_stride,
-                            const int16_t *filter_y,
-                            int filter_y_stride,
-                            int w, int h) {
-  vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                             filter_x, filter_x_stride,
-                             filter_y, filter_y_stride, w, h, 8);
-}
-
-void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,
-                             int filter_x_stride,
-                             const int16_t *filter_y,
-                             int filter_y_stride,
-                             int w, int h) {
-  vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
-                             filter_x, filter_x_stride,
-                             filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x,
-                            int filter_x_stride,
-                            const int16_t *filter_y,
-                            int filter_y_stride,
-                            int w, int h) {
-  vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
-                            filter_x, filter_x_stride,
-                            filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x,
-                               int filter_x_stride,
-                               const int16_t *filter_y,
-                               int filter_y_stride,
-                               int w, int h) {
-  vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                               filter_x, filter_x_stride,
-                               filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x,
-                                   int filter_x_stride,
-                                   const int16_t *filter_y,
-                                   int filter_y_stride,
-                                   int w, int h) {
-  vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                                   filter_x, filter_x_stride,
-                                   filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x,
-                              int filter_x_stride,
-                              const int16_t *filter_y,
-                              int filter_y_stride,
-                              int w, int h) {
-  vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                              filter_x, filter_x_stride,
-                              filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x,
-                                  int filter_x_stride,
-                                  const int16_t *filter_y,
-                                  int filter_y_stride,
-                                  int w, int h) {
-  vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                                  filter_x, filter_x_stride,
-                                  filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x,
-                         int filter_x_stride,
-                         const int16_t *filter_y,
-                         int filter_y_stride,
-                         int w, int h) {
-  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
-                         filter_x, filter_x_stride,
-                         filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,
-                             int filter_x_stride,
-                             const int16_t *filter_y,
-                             int filter_y_stride,
-                             int w, int h) {
-  vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                             filter_x, filter_x_stride,
-                             filter_y, filter_y_stride, w, h, 10);
-}
-
-void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,
-                             int filter_x_stride,
-                             const int16_t *filter_y,
-                             int filter_y_stride,
-                             int w, int h) {
-  vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
-                             filter_x, filter_x_stride,
-                             filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x,
-                            int filter_x_stride,
-                            const int16_t *filter_y,
-                            int filter_y_stride,
-                            int w, int h) {
-  vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
-                            filter_x, filter_x_stride,
-                            filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x,
-                               int filter_x_stride,
-                               const int16_t *filter_y,
-                               int filter_y_stride,
-                               int w, int h) {
-  vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                               filter_x, filter_x_stride,
-                               filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x,
-                                   int filter_x_stride,
-                                   const int16_t *filter_y,
-                                   int filter_y_stride,
-                                   int w, int h) {
-  vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                                   filter_x, filter_x_stride,
-                                   filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x,
-                              int filter_x_stride,
-                              const int16_t *filter_y,
-                              int filter_y_stride,
-                              int w, int h) {
-  vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                              filter_x, filter_x_stride,
-                              filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x,
-                                  int filter_x_stride,
-                                  const int16_t *filter_y,
-                                  int filter_y_stride,
-                                  int w, int h) {
-  vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                                  filter_x, filter_x_stride,
-                                  filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x,
-                         int filter_x_stride,
-                         const int16_t *filter_y,
-                         int filter_y_stride,
-                         int w, int h) {
-  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
-                         filter_x, filter_x_stride,
-                         filter_y, filter_y_stride, w, h, 12);
-}
-
-void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,
-                             int filter_x_stride,
-                             const int16_t *filter_y,
-                             int filter_y_stride,
-                             int w, int h) {
-  vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                             filter_x, filter_x_stride,
-                             filter_y, filter_y_stride, w, h, 12);
-}
+WRAP(convolve_copy_c, 8)  // *_c wrappers: 8 functions at 8, 10 and 12 bits.
+WRAP(convolve_avg_c, 8)
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_avg_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve8_avg_vert_c, 8)
+WRAP(convolve8_c, 8)
+WRAP(convolve8_avg_c, 8)
+WRAP(convolve_copy_c, 10)
+WRAP(convolve_avg_c, 10)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_avg_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve8_avg_vert_c, 10)
+WRAP(convolve8_c, 10)
+WRAP(convolve8_avg_c, 10)
+WRAP(convolve_copy_c, 12)
+WRAP(convolve_avg_c, 12)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_avg_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+WRAP(convolve8_avg_vert_c, 12)
+WRAP(convolve8_c, 12)
+WRAP(convolve8_avg_c, 12)
+#undef WRAP  // Only needed to stamp out the wrappers above.
 
 const ConvolveFunctions convolve8_c(
     wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
@@ -1563,7 +1124,11 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
+#else
     wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
@@ -1571,7 +1136,11 @@ const ConvolveFunctions convolve8_sse2(
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
 const ConvolveFunctions convolve10_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
+#else
     wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
@@ -1579,7 +1148,11 @@ const ConvolveFunctions convolve10_sse2(
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
 const ConvolveFunctions convolve12_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
+#else
     wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc
index e9de76ad..332210da 100644
--- a/libvpx/test/dct16x16_test.cc
+++ b/libvpx/test/dct16x16_test.cc
@@ -40,30 +40,6 @@ static int round(double x) {
 #endif
 
 const int kNumCoeffs = 256;
-const double PI = 3.1415926535898;
-void reference2_16x16_idct_2d(double *input, double *output) {
-  double x;
-  for (int l = 0; l < 16; ++l) {
-    for (int k = 0; k < 16; ++k) {
-      double s = 0;
-      for (int i = 0; i < 16; ++i) {
-        for (int j = 0; j < 16; ++j) {
-          x = cos(PI * j * (l + 0.5) / 16.0) *
-              cos(PI * i * (k + 0.5) / 16.0) *
-              input[i * 16 + j] / 256;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
-        }
-      }
-      output[k*16+l] = s;
-    }
-  }
-}
-
-
 const double C1 = 0.995184726672197;
 const double C2 = 0.98078528040323;
 const double C3 = 0.956940335732209;
diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc
index be4ef9af..128436ee 100644
--- a/libvpx/test/encode_test_driver.cc
+++ b/libvpx/test/encode_test_driver.cc
@@ -195,6 +195,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
     video->Begin();
     encoder->InitEncoder(video);
+    ASSERT_FALSE(::testing::Test::HasFatalFailure());
 
     unsigned long dec_init_flags = 0;  // NOLINT
     // Use fragment decoder if encoder outputs partitions.
diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h
index 9ecc4989..6d0a72f9 100644
--- a/libvpx/test/encode_test_driver.h
+++ b/libvpx/test/encode_test_driver.h
@@ -124,6 +124,11 @@ class Encoder {
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, int *arg) {  // Overload for int* control arguments.
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();  // Fatal failure on error.
+  }
+
   void Control(int ctrl_id, struct vpx_scaling_mode *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc
index 9e512adb..9a2ad2f3 100644
--- a/libvpx/test/error_resilience_test.cc
+++ b/libvpx/test/error_resilience_test.cc
@@ -20,10 +20,11 @@ const int kMaxErrorFrames = 12;
 const int kMaxDroppableFrames = 12;
 
 class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, bool> {
  protected:
   ErrorResilienceTestLarge()
       : EncoderTest(GET_PARAM(0)),
+        svc_support_(GET_PARAM(2)),
         psnr_(0.0),
         nframes_(0),
         mismatch_psnr_(0.0),
@@ -193,6 +194,8 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
      pattern_switch_ = frame_switch;
    }
 
+  bool svc_support_;
+
  private:
   double psnr_;
   unsigned int nframes_;
@@ -302,6 +305,10 @@ TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
 // two layer temporal pattern. The base layer does not predict from the top
 // layer, so successful decoding is expected.
 TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
+  // This test doesn't run if SVC is not supported.
+  if (!svc_support_)
+    return;
+
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 500;
@@ -347,6 +354,10 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
 // for a two layer temporal pattern, where at some point in the
 // sequence, the LAST ref is not used anymore.
 TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) {
+  // This test doesn't run if SVC is not supported.
+  if (!svc_support_)
+    return;
+
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 500;
@@ -579,9 +590,13 @@ TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) {
   }
 }
 
-VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                          ::testing::Values(true));
 VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls,
                           ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
-VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                          ::testing::Values(true));
+// SVC-related tests don't run for VP10 since SVC is not supported.
+VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                           ::testing::Values(false));
 }  // namespace
diff --git a/libvpx/test/frame_size_tests.cc b/libvpx/test/frame_size_tests.cc
index 95cc66ad..d39c8f6e 100644
--- a/libvpx/test/frame_size_tests.cc
+++ b/libvpx/test/frame_size_tests.cc
@@ -74,7 +74,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
   // size or almost 1 gig of memory.
   // In total the allocations will exceed 2GiB which may cause a failure with
   // mingw + wine, use a smaller size in that case.
-#if defined(_WIN32) && !defined(_WIN64)
+#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
   video.SetSize(4096, 3072);
 #else
   video.SetSize(4096, 4096);
diff --git a/libvpx/test/idct8x8_test.cc b/libvpx/test/idct8x8_test.cc
index 987ba753..7f9d751d 100644
--- a/libvpx/test/idct8x8_test.cc
+++ b/libvpx/test/idct8x8_test.cc
@@ -67,43 +67,6 @@ void reference_dct_2d(int16_t input[64], double output[64]) {
     output[i] *= 2;
 }
 
-void reference_idct_1d(double input[8], double output[8]) {
-  const double kPi = 3.141592653589793238462643383279502884;
-  const double kSqrt2 = 1.414213562373095048801688724209698;
-  for (int k = 0; k < 8; k++) {
-    output[k] = 0.0;
-    for (int n = 0; n < 8; n++) {
-      output[k] += input[n]*cos(kPi*(2*k+1)*n/16.0);
-      if (n == 0)
-        output[k] = output[k]/kSqrt2;
-    }
-  }
-}
-
-void reference_idct_2d(double input[64], int16_t output[64]) {
-  double out[64], out2[64];
-  // First transform rows
-  for (int i = 0; i < 8; ++i) {
-    double temp_in[8], temp_out[8];
-    for (int j = 0; j < 8; ++j)
-      temp_in[j] = input[j + i*8];
-    reference_idct_1d(temp_in, temp_out);
-    for (int j = 0; j < 8; ++j)
-      out[j + i*8] = temp_out[j];
-  }
-  // Then transform columns
-  for (int i = 0; i < 8; ++i) {
-    double temp_in[8], temp_out[8];
-    for (int j = 0; j < 8; ++j)
-      temp_in[j] = out[j*8 + i];
-    reference_idct_1d(temp_in, temp_out);
-    for (int j = 0; j < 8; ++j)
-      out2[j*8 + i] = temp_out[j];
-  }
-  for (int i = 0; i < 64; ++i)
-    output[i] = round(out2[i]/32);
-}
-
 TEST(VP9Idct8x8Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 10000;
diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc
deleted file mode 100644
index 65a06974..00000000
--- a/libvpx/test/intrapred_test.cc
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "vp8/common/blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-using libvpx_test::ACMRandom;
-
-class IntraPredBase {
- public:
-  virtual ~IntraPredBase() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void SetupMacroblock(MACROBLOCKD *mbptr,
-                       MODE_INFO *miptr,
-                       uint8_t *data,
-                       int block_size,
-                       int stride,
-                       int num_planes) {
-    mbptr_ = mbptr;
-    miptr_ = miptr;
-    mbptr_->up_available = 1;
-    mbptr_->left_available = 1;
-    mbptr_->mode_info_context = miptr_;
-    stride_ = stride;
-    block_size_ = block_size;
-    num_planes_ = num_planes;
-    for (int p = 0; p < num_planes; p++)
-      data_ptr_[p] = data + stride * (block_size + 1) * p +
-                     stride + block_size;
-  }
-
-  void FillRandom() {
-    // Fill edges with random data
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    for (int p = 0; p < num_planes_; p++) {
-      for (int x = -1 ; x <= block_size_; x++)
-        data_ptr_[p][x - stride_] = rnd.Rand8();
-      for (int y = 0; y < block_size_; y++)
-        data_ptr_[p][y * stride_ - 1] = rnd.Rand8();
-    }
-  }
-
-  virtual void Predict(MB_PREDICTION_MODE mode) = 0;
-
-  void SetLeftUnavailable() {
-    mbptr_->left_available = 0;
-    for (int p = 0; p < num_planes_; p++)
-      for (int i = -1; i < block_size_; ++i)
-        data_ptr_[p][stride_ * i - 1] = 129;
-  }
-
-  void SetTopUnavailable() {
-    mbptr_->up_available = 0;
-    for (int p = 0; p < num_planes_; p++)
-      memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2);
-  }
-
-  void SetTopLeftUnavailable() {
-    SetLeftUnavailable();
-    SetTopUnavailable();
-  }
-
-  int BlockSizeLog2Min1() const {
-    switch (block_size_) {
-      case 16:
-        return 3;
-      case 8:
-        return 2;
-      default:
-        return 0;
-    }
-  }
-
-  // check DC prediction output against a reference
-  void CheckDCPrediction() const {
-    for (int p = 0; p < num_planes_; p++) {
-      // calculate expected DC
-      int expected;
-      if (mbptr_->up_available || mbptr_->left_available) {
-        int sum = 0, shift = BlockSizeLog2Min1() + mbptr_->up_available +
-                             mbptr_->left_available;
-        if (mbptr_->up_available)
-          for (int x = 0; x < block_size_; x++)
-            sum += data_ptr_[p][x - stride_];
-        if (mbptr_->left_available)
-          for (int y = 0; y < block_size_; y++)
-            sum += data_ptr_[p][y * stride_ - 1];
-        expected = (sum + (1 << (shift - 1))) >> shift;
-      } else {
-        expected = 0x80;
-      }
-      // check that all subsequent lines are equal to the first
-      for (int y = 1; y < block_size_; ++y)
-        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
-                            block_size_));
-      // within the first line, ensure that each pixel has the same value
-      for (int x = 1; x < block_size_; ++x)
-        ASSERT_EQ(data_ptr_[p][0], data_ptr_[p][x]);
-      // now ensure that that pixel has the expected (DC) value
-      ASSERT_EQ(expected, data_ptr_[p][0]);
-    }
-  }
-
-  // check V prediction output against a reference
-  void CheckVPrediction() const {
-    // check that all lines equal the top border
-    for (int p = 0; p < num_planes_; p++)
-      for (int y = 0; y < block_size_; y++)
-        ASSERT_EQ(0, memcmp(&data_ptr_[p][-stride_],
-                            &data_ptr_[p][y * stride_], block_size_));
-  }
-
-  // check H prediction output against a reference
-  void CheckHPrediction() const {
-    // for each line, ensure that each pixel is equal to the left border
-    for (int p = 0; p < num_planes_; p++)
-      for (int y = 0; y < block_size_; y++)
-        for (int x = 0; x < block_size_; x++)
-          ASSERT_EQ(data_ptr_[p][-1 + y * stride_],
-                    data_ptr_[p][x + y * stride_]);
-  }
-
-  static int ClipByte(int value) {
-    if (value > 255)
-      return 255;
-    else if (value < 0)
-      return 0;
-    return value;
-  }
-
-  // check TM prediction output against a reference
-  void CheckTMPrediction() const {
-    for (int p = 0; p < num_planes_; p++)
-      for (int y = 0; y < block_size_; y++)
-        for (int x = 0; x < block_size_; x++) {
-          const int expected = ClipByte(data_ptr_[p][x - stride_]
-                                      + data_ptr_[p][stride_ * y - 1]
-                                      - data_ptr_[p][-1 - stride_]);
-          ASSERT_EQ(expected, data_ptr_[p][y * stride_ + x]);
-       }
-  }
-
-  // Actual test
-  void RunTest() {
-    {
-      SCOPED_TRACE("DC_PRED");
-      FillRandom();
-      Predict(DC_PRED);
-      CheckDCPrediction();
-    }
-    {
-      SCOPED_TRACE("DC_PRED LEFT");
-      FillRandom();
-      SetLeftUnavailable();
-      Predict(DC_PRED);
-      CheckDCPrediction();
-    }
-    {
-      SCOPED_TRACE("DC_PRED TOP");
-      FillRandom();
-      SetTopUnavailable();
-      Predict(DC_PRED);
-      CheckDCPrediction();
-    }
-    {
-      SCOPED_TRACE("DC_PRED TOP_LEFT");
-      FillRandom();
-      SetTopLeftUnavailable();
-      Predict(DC_PRED);
-      CheckDCPrediction();
-    }
-    {
-      SCOPED_TRACE("H_PRED");
-      FillRandom();
-      Predict(H_PRED);
-      CheckHPrediction();
-    }
-    {
-      SCOPED_TRACE("V_PRED");
-      FillRandom();
-      Predict(V_PRED);
-      CheckVPrediction();
-    }
-    {
-      SCOPED_TRACE("TM_PRED");
-      FillRandom();
-      Predict(TM_PRED);
-      CheckTMPrediction();
-    }
-  }
-
-  MACROBLOCKD *mbptr_;
-  MODE_INFO *miptr_;
-  uint8_t *data_ptr_[2];  // in the case of Y, only [0] is used
-  int stride_;
-  int block_size_;
-  int num_planes_;
-};
-
-typedef void (*IntraPredYFunc)(MACROBLOCKD *x,
-                               uint8_t *yabove_row,
-                               uint8_t *yleft,
-                               int left_stride,
-                               uint8_t *ypred_ptr,
-                               int y_stride);
-
-class IntraPredYTest
-    : public IntraPredBase,
-      public ::testing::TestWithParam<IntraPredYFunc> {
- public:
-  static void SetUpTestCase() {
-    mb_ = reinterpret_cast<MACROBLOCKD*>(
-        vpx_memalign(32, sizeof(MACROBLOCKD)));
-    mi_ = reinterpret_cast<MODE_INFO*>(
-        vpx_memalign(32, sizeof(MODE_INFO)));
-    data_array_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kDataBufferSize));
-  }
-
-  static void TearDownTestCase() {
-    vpx_free(data_array_);
-    vpx_free(mi_);
-    vpx_free(mb_);
-    data_array_ = NULL;
-  }
-
- protected:
-  static const int kBlockSize = 16;
-  static const int kDataAlignment = 16;
-  static const int kStride = kBlockSize * 3;
-  // We use 48 so that the data pointer of the first pixel in each row of
-  // each macroblock is 16-byte aligned, and this gives us access to the
-  // top-left and top-right corner pixels belonging to the top-left/right
-  // macroblocks.
-  // We use 17 lines so we have one line above us for top-prediction.
-  static const int kDataBufferSize = kStride * (kBlockSize + 1);
-
-  virtual void SetUp() {
-    pred_fn_ = GetParam();
-    SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 1);
-  }
-
-  virtual void Predict(MB_PREDICTION_MODE mode) {
-    mbptr_->mode_info_context->mbmi.mode = mode;
-    ASM_REGISTER_STATE_CHECK(pred_fn_(mbptr_,
-                                      data_ptr_[0] - kStride,
-                                      data_ptr_[0] - 1, kStride,
-                                      data_ptr_[0], kStride));
-  }
-
-  IntraPredYFunc pred_fn_;
-  static uint8_t* data_array_;
-  static MACROBLOCKD * mb_;
-  static MODE_INFO *mi_;
-};
-
-MACROBLOCKD* IntraPredYTest::mb_ = NULL;
-MODE_INFO* IntraPredYTest::mi_ = NULL;
-uint8_t* IntraPredYTest::data_array_ = NULL;
-
-TEST_P(IntraPredYTest, IntraPredTests) {
-  RunTest();
-}
-
-INSTANTIATE_TEST_CASE_P(C, IntraPredYTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mby_s_c));
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, IntraPredYTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mby_s_sse2));
-#endif
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mby_s_ssse3));
-#endif
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mby_s_neon));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, IntraPredYTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mby_s_msa));
-#endif
-
-typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
-                                uint8_t *uabove_row,
-                                uint8_t *vabove_row,
-                                uint8_t *uleft,
-                                uint8_t *vleft,
-                                int left_stride,
-                                uint8_t *upred_ptr,
-                                uint8_t *vpred_ptr,
-                                int pred_stride);
-
-class IntraPredUVTest
-    : public IntraPredBase,
-      public ::testing::TestWithParam<IntraPredUvFunc> {
- public:
-  static void SetUpTestCase() {
-    mb_ = reinterpret_cast<MACROBLOCKD*>(
-        vpx_memalign(32, sizeof(MACROBLOCKD)));
-    mi_ = reinterpret_cast<MODE_INFO*>(
-        vpx_memalign(32, sizeof(MODE_INFO)));
-    data_array_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kDataBufferSize));
-  }
-
-  static void TearDownTestCase() {
-    vpx_free(data_array_);
-    vpx_free(mi_);
-    vpx_free(mb_);
-    data_array_ = NULL;
-  }
-
- protected:
-  static const int kBlockSize = 8;
-  static const int kDataAlignment = 8;
-  static const int kStride = kBlockSize * 3;
-  // We use 24 so that the data pointer of the first pixel in each row of
-  // each macroblock is 8-byte aligned, and this gives us access to the
-  // top-left and top-right corner pixels belonging to the top-left/right
-  // macroblocks.
-  // We use 9 lines so we have one line above us for top-prediction.
-  // [0] = U, [1] = V
-  static const int kDataBufferSize = 2 * kStride * (kBlockSize + 1);
-
-  virtual void SetUp() {
-    pred_fn_ = GetParam();
-    SetupMacroblock(mb_, mi_, data_array_, kBlockSize, kStride, 2);
-  }
-
-  virtual void Predict(MB_PREDICTION_MODE mode) {
-    mbptr_->mode_info_context->mbmi.uv_mode = mode;
-    pred_fn_(mbptr_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
-             data_ptr_[0] - 1, data_ptr_[1] - 1, kStride,
-             data_ptr_[0], data_ptr_[1], kStride);
-  }
-
-  IntraPredUvFunc pred_fn_;
-  // We use 24 so that the data pointer of the first pixel in each row of
-  // each macroblock is 8-byte aligned, and this gives us access to the
-  // top-left and top-right corner pixels belonging to the top-left/right
-  // macroblocks.
-  // We use 9 lines so we have one line above us for top-prediction.
-  // [0] = U, [1] = V
-  static uint8_t* data_array_;
-  static MACROBLOCKD* mb_;
-  static MODE_INFO* mi_;
-};
-
-MACROBLOCKD* IntraPredUVTest::mb_ = NULL;
-MODE_INFO* IntraPredUVTest::mi_ = NULL;
-uint8_t* IntraPredUVTest::data_array_ = NULL;
-
-TEST_P(IntraPredUVTest, IntraPredTests) {
-  RunTest();
-}
-
-INSTANTIATE_TEST_CASE_P(C, IntraPredUVTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mbuv_s_c));
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, IntraPredUVTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mbuv_s_sse2));
-#endif
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mbuv_s_ssse3));
-#endif
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mbuv_s_neon));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, IntraPredUVTest,
-                        ::testing::Values(
-                            vp8_build_intra_predictors_mbuv_s_msa));
-#endif
-
-}  // namespace
diff --git a/libvpx/test/invalid_file_test.cc b/libvpx/test/invalid_file_test.cc
index 1b5ef5c8..f4241eb8 100644
--- a/libvpx/test/invalid_file_test.cc
+++ b/libvpx/test/invalid_file_test.cc
@@ -63,9 +63,22 @@ class InvalidFileTest
     EXPECT_NE(res, EOF) << "Read result data failed";
 
     // Check results match.
-    EXPECT_EQ(expected_res_dec, res_dec)
-        << "Results don't match: frame number = " << video.frame_number()
-        << ". (" << decoder->DecodeError() << ")";
+    const DecodeParam input = GET_PARAM(1);
+    if (input.threads > 1) {
+      // The serial decode check is too strict for tile-threaded decoding as
+      // there is no guarantee on the decode order nor which specific error
+      // will take precedence. Currently a tile-level error is not forwarded so
+      // the frame will simply be marked corrupt.
+      EXPECT_TRUE(res_dec == expected_res_dec ||
+                  res_dec == VPX_CODEC_CORRUPT_FRAME)
+          << "Results don't match: frame number = " << video.frame_number()
+          << ". (" << decoder->DecodeError() << "). Expected: "
+          << expected_res_dec << " or " << VPX_CODEC_CORRUPT_FRAME;
+    } else {
+      EXPECT_EQ(expected_res_dec, res_dec)
+          << "Results don't match: frame number = " << video.frame_number()
+          << ". (" << decoder->DecodeError() << ")";
+    }
 
     return !HasFailure();
   }
@@ -145,7 +158,7 @@ TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
 }
 
 const DecodeParam kVP9InvalidFileInvalidPeekTests[] = {
-  {1, "invalid-vp90-01-v2.webm"},
+  {1, "invalid-vp90-01-v3.webm"},
 };
 
 VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
diff --git a/libvpx/test/lpf_8_test.cc b/libvpx/test/lpf_8_test.cc
index 966e1095..0bf6b0c2 100644
--- a/libvpx/test/lpf_8_test.cc
+++ b/libvpx/test/lpf_8_test.cc
@@ -590,7 +590,9 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h
index 8e72f911..489c4194 100644
--- a/libvpx/test/register_state_check.h
+++ b/libvpx/test/register_state_check.h
@@ -30,7 +30,9 @@
 
 #if defined(_WIN64)
 
-#define _WIN32_LEAN_AND_MEAN
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #include <winnt.h>
 
diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc
index f1134aaf..98b6f87e 100644
--- a/libvpx/test/resize_test.cc
+++ b/libvpx/test/resize_test.cc
@@ -81,6 +81,15 @@ static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
 const unsigned int kInitialWidth = 320;
 const unsigned int kInitialHeight = 240;
 
+// Timestamp and dimensions recorded for a single frame.
+struct FrameInfo {
+  FrameInfo(vpx_codec_pts_t pts_in, unsigned int w_in, unsigned int h_in)
+      : pts(pts_in), w(w_in), h(h_in) {}
+  vpx_codec_pts_t pts;
+  unsigned int w;
+  unsigned int h;
+};
+
 unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
   if (frame < 10)
     return val;
@@ -120,15 +129,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 
   virtual ~ResizeTest() {}
 
-  struct FrameInfo {
-    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
-        : pts(_pts), w(_w), h(_h) {}
-
-    vpx_codec_pts_t pts;
-    unsigned int w;
-    unsigned int h;
-  };
-
   virtual void SetUp() {
     InitializeConfig();
     SetMode(GET_PARAM(1));
@@ -196,13 +196,27 @@ class ResizeInternalTest : public ResizeTest {
 
   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                   libvpx_test::Encoder *encoder) {
-    if (video->frame() == kStepDownFrame) {
-      struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
-      encoder->Control(VP8E_SET_SCALEMODE, &mode);
-    }
-    if (video->frame() == kStepUpFrame) {
-      struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
-      encoder->Control(VP8E_SET_SCALEMODE, &mode);
+    if (change_config_) {
+      int new_q = 60;
+      if (video->frame() == 0) {
+        struct vpx_scaling_mode mode = {VP8E_ONETWO, VP8E_ONETWO};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+      }
+      if (video->frame() == 1) {
+        struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+        cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q;
+        encoder->Config(&cfg_);
+      }
+    } else {
+      if (video->frame() == kStepDownFrame) {
+        struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+      }
+      if (video->frame() == kStepUpFrame) {
+        struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+      }
     }
   }
 
@@ -227,6 +241,7 @@ class ResizeInternalTest : public ResizeTest {
 #endif
 
   double frame0_psnr_;
+  bool change_config_;
 #if WRITE_COMPRESSED_STREAM
   FILE *outfile_;
   unsigned int out_frames_;
@@ -237,6 +252,7 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 10);
   init_flags_ = VPX_CODEC_USE_PSNR;
+  change_config_ = false;
 
   // q picked such that initial keyframe on this clip is ~30dB PSNR
   cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
@@ -261,6 +277,143 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
   }
 }
 
+TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 10);
+  cfg_.g_w = 352;  // Match the encoder size to the 352x288 source clip.
+  cfg_.g_h = 288;
+  change_config_ = true;  // Select PreEncodeFrameHook's change-config path.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));  // No further checks are made.
+}
+
+// Fixture for the real-time resize tests below: param 1 is the test mode,
+// param 2 the VP8E_SET_CPUUSED value. The constructor gives every member a
+// defined value so a test that forgets to set one cannot read garbage.
+class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
+  public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  ResizeInternalRealtimeTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(0), change_bitrate_(false) {}
+  virtual ~ResizeInternalRealtimeTest() {}
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+    }
+    // One-shot bitrate change at frame 120, only when a test requested it.
+    if (change_bitrate_ && video->frame() == 120) {
+      change_bitrate_ = false;
+      cfg_.rc_target_bitrate = 500;
+      encoder->Config(&cfg_);
+    }
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t pts) {
+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+  }
+
+  void DefaultConfig() {
+    cfg_.g_w = 352;
+    cfg_.g_h = 288;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.kf_mode = VPX_KF_AUTO;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+    cfg_.rc_dropframe_thresh = 1;  // Enable dropped frames.
+    cfg_.g_error_resilient = 1;    // Enable error_resilience mode.
+    cfg_.rc_resize_allowed = 1;    // Enable dynamic resizing.
+    cfg_.rc_target_bitrate = 200;  // Run at low bitrate.
+  }
+
+  std::vector<FrameInfo> frame_info_list_;
+  int set_cpu_used_;
+  bool change_bitrate_;
+};
+
+// Verify the dynamic resizer behavior for real-time, 1-pass CBR mode.
+// Run at a low bitrate with rc_resize_allowed = 1 and check that exactly
+// one resize-down event occurs.
+TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  DefaultConfig();
+  change_bitrate_ = false;  // No mid-clip bitrate change.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;  // Start from the configured frame size.
+  unsigned int last_h = cfg_.g_h;
+  int resize_count = 0;  // Size changes seen across the recorded frames.
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    if (info->w != last_w || info->h != last_h) {
+      // Every size change must shrink both dimensions.
+      ASSERT_LT(info->w, last_w);
+      ASSERT_LT(info->h, last_h);
+      last_w = info->w;
+      last_h = info->h;
+      resize_count++;
+    }
+  }
+
+  // Exactly one resize-down event is expected in this test.
+  ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+}
+
+// Verify the dynamic resizer behavior for real-time, 1-pass CBR mode.
+// Start at a low target bitrate and raise it in the middle of the clip;
+// scaling up should occur after the bitrate has changed.
+TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  DefaultConfig();
+  change_bitrate_ = true;  // Fixture raises bitrate to 500 at frame 120.
+  // Disable dropped frames.
+  cfg_.rc_dropframe_thresh = 0;
+  // Starting bitrate low.
+  cfg_.rc_target_bitrate = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;  // Start from the configured frame size.
+  unsigned int last_h = cfg_.g_h;
+  int resize_count = 0;  // Size changes seen across the recorded frames.
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    if (info->w != last_w || info->h != last_h) {
+      resize_count++;
+      if (resize_count == 1) {
+        // Verify that resize down occurs.
+        ASSERT_LT(info->w, last_w);
+        ASSERT_LT(info->h, last_h);
+      } else if (resize_count == 2) {
+        // Verify that resize up occurs.
+        ASSERT_GT(info->w, last_w);
+        ASSERT_GT(info->h, last_h);
+      }
+      last_w = info->w;
+      last_h = info->h;
+    }
+  }
+
+  // Verify that we get 2 resize events in this test.
+  ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
+}
+
 vpx_img_fmt_t CspForFrameNumber(int frame) {
   if (frame < 10)
     return VPX_IMG_FMT_I420;
@@ -371,6 +524,9 @@ VP9_INSTANTIATE_TEST_CASE(ResizeTest,
                           ::testing::Values(::libvpx_test::kRealTime));
 VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
                           ::testing::Values(::libvpx_test::kOnePassBest));
+VP9_INSTANTIATE_TEST_CASE(ResizeInternalRealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 9));
 VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
                           ::testing::Values(::libvpx_test::kRealTime));
 }  // namespace
diff --git a/libvpx/test/sixtap_predict_test.cc b/libvpx/test/sixtap_predict_test.cc
index 8c7c98d8..1e682e7b 100644
--- a/libvpx/test/sixtap_predict_test.cc
+++ b/libvpx/test/sixtap_predict_test.cc
@@ -201,7 +201,7 @@ const SixtapPredictFunc sixtap_16x16_neon = vp8_sixtap_predict16x16_neon;
 const SixtapPredictFunc sixtap_8x8_neon = vp8_sixtap_predict8x8_neon;
 const SixtapPredictFunc sixtap_8x4_neon = vp8_sixtap_predict8x4_neon;
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_NEON, SixtapPredictTest, ::testing::Values(
+    NEON, SixtapPredictTest, ::testing::Values(
         make_tuple(16, 16, sixtap_16x16_neon),
         make_tuple(8, 8, sixtap_8x8_neon),
         make_tuple(8, 4, sixtap_8x4_neon)));
diff --git a/libvpx/test/superframe_test.cc b/libvpx/test/superframe_test.cc
index a8102b75..90aa75b4 100644
--- a/libvpx/test/superframe_test.cc
+++ b/libvpx/test/superframe_test.cc
@@ -16,8 +16,13 @@
 
 namespace {
 
+const int kTestMode = 0;
+const int kSuperframeSyntax = 1;
+
+typedef std::tr1::tuple<libvpx_test::TestMode,int> SuperframeTestParam;
+
 class SuperframeTest : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+    public ::libvpx_test::CodecTestWithParam<SuperframeTestParam> {
  protected:
   SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(NULL),
       last_sf_pts_(0) {}
@@ -25,9 +30,13 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
 
   virtual void SetUp() {
     InitializeConfig();
-    SetMode(GET_PARAM(1));
+    const SuperframeTestParam input = GET_PARAM(1);
+    const libvpx_test::TestMode mode = std::tr1::get<kTestMode>(input);
+    const int syntax = std::tr1::get<kSuperframeSyntax>(input);
+    SetMode(mode);
     sf_count_ = 0;
     sf_count_max_ = INT_MAX;
+    is_vp10_style_superframe_ = syntax;
   }
 
   virtual void TearDown() {
@@ -50,7 +59,8 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
     const uint8_t marker = buffer[pkt->data.frame.sz - 1];
     const int frames = (marker & 0x7) + 1;
     const int mag = ((marker >> 3) & 3) + 1;
-    const unsigned int index_sz = 2 + mag  * frames;
+    const unsigned int index_sz =
+        2 + mag * (frames - is_vp10_style_superframe_);
     if ((marker & 0xe0) == 0xc0 &&
         pkt->data.frame.sz >= index_sz &&
         buffer[pkt->data.frame.sz - index_sz] == marker) {
@@ -75,6 +85,7 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
     return pkt;
   }
 
+  int is_vp10_style_superframe_;
   int sf_count_;
   int sf_count_max_;
   vpx_codec_cx_pkt_t modified_pkt_;
@@ -92,9 +103,11 @@ TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
   EXPECT_EQ(sf_count_, 1);
 }
 
-VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(
-    ::libvpx_test::kTwoPassGood));
+VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Values(0)));
 
-VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(
-    ::libvpx_test::kTwoPassGood));
+VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Values(CONFIG_MISC_FIXES)));
 }  // namespace
diff --git a/libvpx/test/test-data.mk b/libvpx/test/test-data.mk
index dda1c182..4280b35f 100644
--- a/libvpx/test/test-data.mk
+++ b/libvpx/test/test-data.mk
@@ -18,6 +18,7 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
@@ -687,8 +688,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
 # Invalid files for testing libvpx error checking.
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1
index 3590f4e3..4e4ac623 100644
--- a/libvpx/test/test-data.sha1
+++ b/libvpx/test/test-data.sha1
@@ -6,8 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
 c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
-fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v2.webm
-25751f5d3b05ff03f0719ad42cd625348eb8961e *invalid-vp90-01-v2.webm.res
+fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res
 d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
 8e2eff4af87d2b561cce2365713269e301457ef3 *invalid-vp90-02-v2.webm.res
 df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm
@@ -743,3 +743,4 @@ d06285d109ecbaef63b0cbcc44d70a129186f51c *invalid-vp90-2-03-size-224x196.webm.iv
 e60d859b0ef2b331b21740cf6cb83fabe469b079 *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf
 0ae808dca4d3c1152a9576e14830b6faa39f1b4a *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res
 9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
+5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index 6bb08bed..8d662448 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -36,6 +36,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -110,7 +111,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
-LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
 
@@ -167,6 +167,10 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
 
+## VP10
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
+
 endif # CONFIG_SHARED
 
 include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc
index 26499174..005ea8d1 100644
--- a/libvpx/test/test_libvpx.cc
+++ b/libvpx/test/test_libvpx.cc
@@ -26,6 +26,7 @@ extern void vpx_dsp_rtcd();
 extern void vpx_scale_rtcd();
 }
 
+#if ARCH_X86 || ARCH_X86_64
 static void append_negative_gtest_filter(const char *str) {
   std::string filter = ::testing::FLAGS_gtest_filter;
   // Negative patterns begin with one '-' followed by a ':' separated list.
@@ -33,6 +34,7 @@ static void append_negative_gtest_filter(const char *str) {
   filter += str;
   ::testing::FLAGS_gtest_filter = filter;
 }
+#endif  // ARCH_X86 || ARCH_X86_64
 
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
@@ -55,7 +57,7 @@ int main(int argc, char **argv) {
     append_negative_gtest_filter(":AVX.*:AVX/*");
   if (!(simd_caps & HAS_AVX2))
     append_negative_gtest_filter(":AVX2.*:AVX2/*");
-#endif
+#endif  // ARCH_X86 || ARCH_X86_64
 
 #if !CONFIG_SHARED
 // Shared library builds don't support whitebox tests
diff --git a/libvpx/test/util.h b/libvpx/test/util.h
index 3c45721f..b27bffa9 100644
--- a/libvpx/test/util.h
+++ b/libvpx/test/util.h
@@ -19,8 +19,7 @@
 // Macros
 #define GET_PARAM(k) std::tr1::get< k >(GetParam())
 
-static double compute_psnr(const vpx_image_t *img1,
-                           const vpx_image_t *img2) {
+inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
   assert((img1->fmt == img2->fmt) &&
          (img1->d_w == img2->d_w) &&
          (img1->d_h == img2->d_h));
diff --git a/libvpx/test/video_source.h b/libvpx/test/video_source.h
index 63294d14..ade323e7 100644
--- a/libvpx/test/video_source.h
+++ b/libvpx/test/video_source.h
@@ -11,6 +11,9 @@
 #define TEST_VIDEO_SOURCE_H_
 
 #if defined(_WIN32)
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #endif
 #include <cstdio>
@@ -48,7 +51,7 @@ static std::string GetDataPath() {
 #undef TO_STRING
 #undef STRINGIFY
 
-static FILE *OpenTestDataFile(const std::string& file_name) {
+inline FILE *OpenTestDataFile(const std::string& file_name) {
   const std::string path_to_source = GetDataPath() + "/" + file_name;
   return fopen(path_to_source.c_str(), "rb");
 }
diff --git a/libvpx/test/vp10_dct_test.cc b/libvpx/test/vp10_dct_test.cc
new file mode 100644
index 00000000..b2c301ae
--- /dev/null
+++ b/libvpx/test/vp10_dct_test.cc
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <new>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "vpx_ports/msvc.h"
+
+#undef CONFIG_COEFFICIENT_RANGE_CHECKING
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
+#include "vp10/encoder/dct.c"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+void reference_dct_1d(const double *in, double *out, int size) {
+  const double PI = 3.141592653589793238462643383279502884;
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < size; ++k) {
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
+    }
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+typedef void (*FdctFuncRef)(const double *in, double *out, int size);
+typedef void (*IdctFuncRef)(const double *in, double *out, int size);
+typedef void (*FdctFunc)(const tran_low_t *in, tran_low_t *out);
+typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
+
+class TransTestBase {
+ public:
+  virtual ~TransTestBase() {}
+
+ protected:
+  void RunFwdAccuracyCheck() {
+    tran_low_t *input  = new tran_low_t[txfm_size_];
+    tran_low_t *output = new tran_low_t[txfm_size_];
+    double *ref_input  = new double[txfm_size_];
+    double *ref_output = new double[txfm_size_];
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    for (int ti =  0; ti < count_test_block; ++ti) {
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        input[ni] = rnd.Rand8() - rnd.Rand8();
+        ref_input[ni] = static_cast<double>(input[ni]);
+      }
+
+      fwd_txfm_(input, output);
+      fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
+
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        EXPECT_LE(
+            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
+            max_error_);
+      }
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+
+  double max_error_;
+  int txfm_size_;
+  FdctFunc fwd_txfm_;
+  FdctFuncRef fwd_txfm_ref_;
+};
+
+typedef std::tr1::tuple<FdctFunc, FdctFuncRef, int, int> FdctParam;
+class Vp10FwdTxfm
+    : public TransTestBase,
+      public ::testing::TestWithParam<FdctParam> {
+ public:
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = GET_PARAM(1);
+    txfm_size_ = GET_PARAM(2);
+    max_error_ = GET_PARAM(3);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(Vp10FwdTxfm, RunFwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, Vp10FwdTxfm,
+    ::testing::Values(
+        FdctParam(&fdct4, &reference_dct_1d, 4, 1),
+        FdctParam(&fdct8, &reference_dct_1d, 8, 1),
+        FdctParam(&fdct16, &reference_dct_1d, 16, 2)));
+}  // namespace
diff --git a/libvpx/test/vp10_inv_txfm_test.cc b/libvpx/test/vp10_inv_txfm_test.cc
new file mode 100644
index 00000000..c49081ef
--- /dev/null
+++ b/libvpx/test/vp10_inv_txfm_test.cc
@@ -0,0 +1,321 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/scan.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const double PI = 3.141592653589793238462643383279502884;
+const double kInvSqrt2 = 0.707106781186547524400844362104;
+
+void reference_idct_1d(const double *in, double *out, int size) {
+  for (int n = 0; n < size; ++n) {
+    out[n] = 0;
+    for (int k = 0; k < size; ++k) {
+      if (k == 0)
+        out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
+      else
+        out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
+    }
+  }
+}
+
+typedef void (*IdctFuncRef)(const double *in, double *out, int size);
+typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
+
+class TransTestBase {
+ public:
+  virtual ~TransTestBase() {}
+
+ protected:
+  void RunInvAccuracyCheck() {
+    tran_low_t *input  = new tran_low_t[txfm_size_];
+    tran_low_t *output = new tran_low_t[txfm_size_];
+    double *ref_input  = new double[txfm_size_];
+    double *ref_output = new double[txfm_size_];
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    for (int ti =  0; ti < count_test_block; ++ti) {
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        input[ni] = rnd.Rand8() - rnd.Rand8();
+        ref_input[ni] = static_cast<double>(input[ni]);
+      }
+
+      fwd_txfm_(input, output);
+      fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
+
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        EXPECT_LE(
+            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
+            max_error_);
+      }
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+
+  double max_error_;
+  int txfm_size_;
+  IdctFunc fwd_txfm_;
+  IdctFuncRef fwd_txfm_ref_;
+};
+
+typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
+class Vp10InvTxfm
+    : public TransTestBase,
+      public ::testing::TestWithParam<IdctParam> {
+ public:
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = GET_PARAM(1);
+    txfm_size_ = GET_PARAM(2);
+    max_error_ = GET_PARAM(3);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, Vp10InvTxfm,
+    ::testing::Values(
+        IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
+        IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
+        IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
+        IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
+);
+
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef std::tr1::tuple<FwdTxfmFunc,
+                        InvTxfmFunc,
+                        InvTxfmFunc,
+                        TX_SIZE, int> PartialInvTxfmParam;
+const int kMaxNumCoeffs = 1024;
+class Vp10PartialIDctTest
+    : public ::testing::TestWithParam<PartialInvTxfmParam> {
+ public:
+  virtual ~Vp10PartialIDctTest() {}
+  virtual void SetUp() {
+    ftxfm_ = GET_PARAM(0);
+    full_itxfm_ = GET_PARAM(1);
+    partial_itxfm_ = GET_PARAM(2);
+    tx_size_  = GET_PARAM(3);
+    last_nonzero_ = GET_PARAM(4);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int last_nonzero_;
+  TX_SIZE tx_size_;
+  FwdTxfmFunc ftxfm_;
+  InvTxfmFunc full_itxfm_;
+  InvTxfmFunc partial_itxfm_;
+};
+
+TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int size;
+  switch (tx_size_) {
+    case TX_4X4:
+      size = 4;
+      break;
+    case TX_8X8:
+      size = 8;
+      break;
+    case TX_16X16:
+      size = 16;
+      break;
+    case TX_32X32:
+      size = 32;
+      break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+
+  const int count_test_block = 1000;
+  const int block_size = size * size;
+
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
+
+  int max_error = 0;
+  for (int i = 0; i < count_test_block; ++i) {
+    // clear out destination buffer
+    memset(dst1, 0, sizeof(*dst1) * block_size);
+    memset(dst2, 0, sizeof(*dst2) * block_size);
+    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      if (i == 0) {
+        for (int j = 0; j < block_size; ++j)
+          input_extreme_block[j] = 255;
+      } else if (i == 1) {
+        for (int j = 0; j < block_size; ++j)
+          input_extreme_block[j] = -255;
+      } else {
+        for (int j = 0; j < block_size; ++j) {
+          input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        }
+      }
+
+      ftxfm_(input_extreme_block, output_ref_block, size);
+
+      // quantization with maximum allowed step sizes
+      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+      for (int j = 1; j < last_nonzero_; ++j)
+        test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
+                         = (output_ref_block[j] / 1828) * 1828;
+    }
+
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+
+    for (int j = 0; j < block_size; ++j) {
+      const int diff = dst1[j] - dst2[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+    }
+  }
+
+  EXPECT_EQ(0, max_error)
+      << "Error: partial inverse transform produces different results";
+}
+
+TEST_P(Vp10PartialIDctTest, ResultsMatch) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int size;
+  switch (tx_size_) {
+    case TX_4X4:
+      size = 4;
+      break;
+    case TX_8X8:
+      size = 8;
+      break;
+    case TX_16X16:
+      size = 16;
+      break;
+    case TX_32X32:
+      size = 32;
+      break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+  const int count_test_block = 1000;
+  const int max_coeff = 32766 / 4;
+  const int block_size = size * size;
+  int max_error = 0;
+  for (int i = 0; i < count_test_block; ++i) {
+    // clear out destination buffer
+    memset(dst1, 0, sizeof(*dst1) * block_size);
+    memset(dst2, 0, sizeof(*dst2) * block_size);
+    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+    int max_energy_leftover = max_coeff * max_coeff;
+    for (int j = 0; j < last_nonzero_; ++j) {
+      int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+                                          (rnd.Rand16() - 32768) / 65536);
+      max_energy_leftover -= coef * coef;
+      if (max_energy_leftover < 0) {
+        max_energy_leftover = 0;
+        coef = 0;
+      }
+      test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
+    }
+
+    memcpy(test_coef_block2, test_coef_block1,
+           sizeof(*test_coef_block2) * block_size);
+
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+
+    for (int j = 0; j < block_size; ++j) {
+      const int diff = dst1[j] - dst2[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+    }
+  }
+
+  EXPECT_EQ(0, max_error)
+      << "Error: partial inverse transform produces different results";
+}
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, Vp10PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vp10_idct32x32_1024_add_c,
+                   &vp10_idct32x32_34_add_c,
+                   TX_32X32, 34),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vp10_idct32x32_1024_add_c,
+                   &vp10_idct32x32_1_add_c,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vp10_idct16x16_256_add_c,
+                   &vp10_idct16x16_10_add_c,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vp10_idct16x16_256_add_c,
+                   &vp10_idct16x16_1_add_c,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vp10_idct8x8_64_add_c,
+                   &vp10_idct8x8_12_add_c,
+                   TX_8X8, 12),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vp10_idct8x8_64_add_c,
+                   &vp10_idct8x8_1_add_c,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vp10_idct4x4_16_add_c,
+                   &vp10_idct4x4_1_add_c,
+                   TX_4X4, 1)));
+}  // namespace
diff --git a/libvpx/test/vp9_arf_freq_test.cc b/libvpx/test/vp9_arf_freq_test.cc
index 87ff15b6..89200d40 100644
--- a/libvpx/test/vp9_arf_freq_test.cc
+++ b/libvpx/test/vp9_arf_freq_test.cc
@@ -230,9 +230,23 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
 
+#if CONFIG_VP9_HIGHBITDEPTH
+# if CONFIG_VP10_ENCODER
+// TODO(angiebird): 25-29 fail in high bitdepth mode.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_VP10, ArfFreqTest,
+    ::testing::Combine(
+        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
+            &libvpx_test::kVP10)),
+        ::testing::ValuesIn(kTestVectors),
+        ::testing::ValuesIn(kEncodeVectors),
+        ::testing::ValuesIn(kMinArfVectors)));
+# endif  // CONFIG_VP10_ENCODER
+#else
 VP10_INSTANTIATE_TEST_CASE(
     ArfFreqTest,
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index a02070e4..3ef6022a 100644
--- a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -14,38 +14,10 @@
 #include "test/encode_test_driver.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
-#include "test/yuv_video_source.h"
-#include "vp9/decoder/vp9_decoder.h"
-
-typedef vpx_codec_stream_info_t vp9_stream_info_t;
-struct vpx_codec_alg_priv {
-  vpx_codec_priv_t        base;
-  vpx_codec_dec_cfg_t     cfg;
-  vp9_stream_info_t       si;
-  struct VP9Decoder      *pbi;
-  int                     postproc_cfg_set;
-  vp8_postproc_cfg_t      postproc_cfg;
-  vpx_decrypt_cb          decrypt_cb;
-  void                   *decrypt_state;
-  vpx_image_t             img;
-  int                     img_avail;
-  int                     flushed;
-  int                     invert_tile_order;
-  int                     frame_parallel_decode;
-
-  // External frame buffer info to save for VP9 common.
-  void *ext_priv;  // Private data associated with the external frame buffers.
-  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
-  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
-};
-
-static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
-  return (vpx_codec_alg_priv_t *)ctx->priv;
-}
+#include "vp9/vp9_dx_iface.h"
 
 namespace {
 
-const unsigned int kFramerate = 50;
 const int kCpuUsed = 2;
 
 struct EncodePerfTestVideo {
@@ -66,35 +38,27 @@ struct EncodeParameters {
   int32_t lossless;
   int32_t error_resilient;
   int32_t frame_parallel;
+  vpx_color_range_t color_range;
   vpx_color_space_t cs;
+  int render_size[2];
   // TODO(JBB): quantizers / bitrate
 };
 
 const EncodeParameters kVP9EncodeParameterSet[] = {
-    {0, 0, 0, 1, 0, VPX_CS_BT_601},
-    {0, 0, 0, 0, 0, VPX_CS_BT_709},
-    {0, 0, 1, 0, 0, VPX_CS_BT_2020},
-    {0, 2, 0, 0, 1, VPX_CS_UNKNOWN},
-    // TODO(JBB): Test profiles (requires more work).
+  {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601},
+  {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709},
+  {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020},
+  {0, 2, 0, 0, 1, VPX_CR_STUDIO_RANGE, VPX_CS_UNKNOWN, { 640, 480 }},
+  // TODO(JBB): Test profiles (requires more work).
 };
 
-int is_extension_y4m(const char *filename) {
-  const char *dot = strrchr(filename, '.');
-  if (!dot || dot == filename)
-    return 0;
-  else
-    return !strcmp(dot, ".y4m");
-}
-
 class VpxEncoderParmsGetToDecoder
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<EncodeParameters, \
+      public ::libvpx_test::CodecTestWith2Params<EncodeParameters,
                                                  EncodePerfTestVideo> {
  protected:
   VpxEncoderParmsGetToDecoder()
-      : EncoderTest(GET_PARAM(0)),
-        encode_parms(GET_PARAM(1)) {
-  }
+      : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
 
   virtual ~VpxEncoderParmsGetToDecoder() {}
 
@@ -112,6 +76,7 @@ class VpxEncoderParmsGetToDecoder
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
       encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
+      encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
       encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
                        encode_parms.frame_parallel);
@@ -122,37 +87,44 @@ class VpxEncoderParmsGetToDecoder
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
       encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0)
+        encoder->Control(VP9E_SET_RENDER_SIZE, encode_parms.render_size);
     }
   }
 
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource& video,
+                                  const libvpx_test::VideoSource &video,
                                   libvpx_test::Decoder *decoder) {
-    vpx_codec_ctx_t* vp9_decoder = decoder->GetDecoder();
-    vpx_codec_alg_priv_t* priv =
-        (vpx_codec_alg_priv_t*) get_alg_priv(vp9_decoder);
-
-    VP9Decoder* pbi = priv->pbi;
-    VP9_COMMON* common = &pbi->common;
+    vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
+    vpx_codec_alg_priv_t *const priv =
+        reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
+    FrameWorkerData *const worker_data =
+        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
+    VP9_COMMON *const common = &worker_data->pbi->common;
 
     if (encode_parms.lossless) {
-      EXPECT_EQ(common->base_qindex, 0);
-      EXPECT_EQ(common->y_dc_delta_q, 0);
-      EXPECT_EQ(common->uv_dc_delta_q, 0);
-      EXPECT_EQ(common->uv_ac_delta_q, 0);
-      EXPECT_EQ(common->tx_mode, ONLY_4X4);
+      EXPECT_EQ(0, common->base_qindex);
+      EXPECT_EQ(0, common->y_dc_delta_q);
+      EXPECT_EQ(0, common->uv_dc_delta_q);
+      EXPECT_EQ(0, common->uv_ac_delta_q);
+      EXPECT_EQ(ONLY_4X4, common->tx_mode);
     }
-    EXPECT_EQ(common->error_resilient_mode, encode_parms.error_resilient);
+    EXPECT_EQ(encode_parms.error_resilient, common->error_resilient_mode);
     if (encode_parms.error_resilient) {
-      EXPECT_EQ(common->frame_parallel_decoding_mode, 1);
-      EXPECT_EQ(common->use_prev_frame_mvs, 0);
+      EXPECT_EQ(1, common->frame_parallel_decoding_mode);
+      EXPECT_EQ(0, common->use_prev_frame_mvs);
     } else {
-      EXPECT_EQ(common->frame_parallel_decoding_mode,
-                encode_parms.frame_parallel);
+      EXPECT_EQ(encode_parms.frame_parallel,
+                common->frame_parallel_decoding_mode);
+    }
+    EXPECT_EQ(encode_parms.color_range, common->color_range);
+    EXPECT_EQ(encode_parms.cs, common->color_space);
+    if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
+      EXPECT_EQ(encode_parms.render_size[0], common->render_width);
+      EXPECT_EQ(encode_parms.render_size[1], common->render_height);
     }
-    EXPECT_EQ(common->color_space, encode_parms.cs);
-    EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
-    EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
+    EXPECT_EQ(encode_parms.tile_cols, common->log2_tile_cols);
+    EXPECT_EQ(encode_parms.tile_rows, common->log2_tile_rows);
 
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
@@ -164,35 +136,18 @@ class VpxEncoderParmsGetToDecoder
   EncodeParameters encode_parms;
 };
 
-// TODO(hkuang): This test conflicts with frame parallel decode. So disable it
-// for now until fix.
-TEST_P(VpxEncoderParmsGetToDecoder, DISABLED_BitstreamParms) {
+TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
   init_flags_ = VPX_CODEC_USE_PSNR;
 
-  libvpx_test::VideoSource *video;
-  if (is_extension_y4m(test_video_.name)) {
-    video = new libvpx_test::Y4mVideoSource(test_video_.name,
-                                            0, test_video_.frames);
-  } else {
-    video = new libvpx_test::YUVVideoSource(test_video_.name,
-                                            VPX_IMG_FMT_I420,
-                                            test_video_.width,
-                                            test_video_.height,
-                                            kFramerate, 1, 0,
-                                            test_video_.frames);
-  }
+  libvpx_test::VideoSource *const video =
+      new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames);
+  ASSERT_TRUE(video != NULL);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video));
-  delete(video);
+  delete video;
 }
 
-VP9_INSTANTIATE_TEST_CASE(
-    VpxEncoderParmsGetToDecoder,
-    ::testing::ValuesIn(kVP9EncodeParameterSet),
-    ::testing::ValuesIn(kVP9EncodePerfTestVectors));
-
-VP10_INSTANTIATE_TEST_CASE(
-    VpxEncoderParmsGetToDecoder,
-    ::testing::ValuesIn(kVP9EncodeParameterSet),
-    ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder,
+                          ::testing::ValuesIn(kVP9EncodeParameterSet),
+                          ::testing::ValuesIn(kVP9EncodePerfTestVectors));
 }  // namespace
diff --git a/libvpx/test/vp9_end_to_end_test.cc b/libvpx/test/vp9_end_to_end_test.cc
index e100eb95..be1fa68c 100644
--- a/libvpx/test/vp9_end_to_end_test.cc
+++ b/libvpx/test/vp9_end_to_end_test.cc
@@ -187,9 +187,23 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
 
+#if CONFIG_VP9_HIGHBITDEPTH
+# if CONFIG_VP10_ENCODER
+// TODO(angiebird): many fail in high bitdepth mode.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_VP10, EndToEndTestLarge,
+    ::testing::Combine(
+        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
+            &libvpx_test::kVP10)),
+        ::testing::ValuesIn(kEncodingModeVectors),
+        ::testing::ValuesIn(kTestVectors),
+        ::testing::ValuesIn(kCpuUsedVectors)));
+# endif  // CONFIG_VP10_ENCODER
+#else
 VP10_INSTANTIATE_TEST_CASE(
     EndToEndTestLarge,
     ::testing::ValuesIn(kEncodingModeVectors),
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/libvpx/test/vp9_error_block_test.cc b/libvpx/test/vp9_error_block_test.cc
index 8c5d5a2e..77b12ea8 100644
--- a/libvpx/test/vp9_error_block_test.cc
+++ b/libvpx/test/vp9_error_block_test.cc
@@ -67,12 +67,22 @@ TEST_P(ErrorBlockTest, OperationCheck) {
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      coeff[j]   = rnd(2 << 20) - (1 << 20);
-      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+      // coeff and dqcoeff always have the same sign, and optimized code may
+      // rely on this, so generate test input accordingly.
+      if (rnd(2)) {
+        // Positive number
+        coeff[j]   = rnd(1 << msb);
+        dqcoeff[j] = rnd(1 << msb);
+      } else {
+        // Negative number
+        coeff[j]   = -rnd(1 << msb);
+        dqcoeff[j] = -rnd(1 << msb);
+      }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
                                   bit_depth_);
@@ -85,7 +95,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
       << "First failed at test case " << first_failure;
 }
 
@@ -100,23 +110,36 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
-  int max_val = ((1 << 20) - 1);
+  const int msb = bit_depth_ + 8 - 1;
+  int max_val = ((1 << msb) - 1);
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
-    int k = (i / 9) % 5;
+    int k = (i / 9) % 9;
 
     // Change the maximum coeff value, to test different bit boundaries
-    if ( k == 4 && (i % 9) == 0 ) {
+    if ( k == 8 && (i % 9) == 0 ) {
       max_val >>= 1;
     }
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      if (k < 4) {  // Test at maximum values
-        coeff[j]   = k % 2 ? max_val : -max_val;
-        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+      if (k < 4) {
+        // Test at positive maximum values
+        coeff[j]   = k % 2 ? max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+      } else if (k < 8) {
+        // Test at negative maximum values
+        coeff[j]   = k % 2 ? -max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
       } else {
-        coeff[j]   = rnd(2 << 14) - (1 << 14);
-        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+        if (rnd(2)) {
+          // Positive number
+          coeff[j]   = rnd(1 << 14);
+          dqcoeff[j] = rnd(1 << 14);
+        } else {
+          // Negative number
+          coeff[j]   = -rnd(1 << 14);
+          dqcoeff[j] = -rnd(1 << 14);
+        }
       }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
@@ -130,13 +153,30 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
       << "First failed at test case " << first_failure;
 }
 
 using std::tr1::make_tuple;
 
+#if CONFIG_USE_X86INC
+int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
+                                           const tran_low_t *dqcoeff,
+                                           intptr_t block_size,
+                                           int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
+}
+
 #if HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, ErrorBlockTest,
     ::testing::Values(
@@ -145,7 +185,27 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_highbd_block_error_sse2,
                    &vp9_highbd_block_error_c, VPX_BITS_12),
         make_tuple(&vp9_highbd_block_error_sse2,
-                   &vp9_highbd_block_error_c, VPX_BITS_8)));
+                   &vp9_highbd_block_error_c, VPX_BITS_8),
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
 #endif  // HAVE_SSE2
+
+#if HAVE_AVX
+int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AVX, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_AVX
+
+#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc
index 233e1b1a..92e4b968 100644
--- a/libvpx/test/vp9_thread_test.cc
+++ b/libvpx/test/vp9_thread_test.cc
@@ -190,7 +190,7 @@ string DecodeFile(const string& filename, int num_threads) {
 void DecodeFiles(const FileList files[]) {
   for (const FileList *iter = files; iter->name != NULL; ++iter) {
     SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
+    for (int t = 1; t <= 8; ++t) {
       EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
           << "threads = " << t;
     }
@@ -235,13 +235,13 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) {
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 }
 
-TEST(VP9DecodeMultiThreadedTest, Decode) {
+TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) {
   // no tiles or frame parallel; this exercises loop filter threading.
   EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
             DecodeFile("vp90-2-03-size-226x226.webm", 2));
 }
 
-TEST(VP9DecodeMultiThreadedTest, Decode2) {
+TEST(VP9DecodeMultiThreadedTest, FrameParallel) {
   static const FileList files[] = {
     { "vp90-2-08-tile_1x2_frame_parallel.webm",
       "68ede6abd66bae0a2edf2eb9232241b6" },
@@ -255,8 +255,7 @@ TEST(VP9DecodeMultiThreadedTest, Decode2) {
   DecodeFiles(files);
 }
 
-// Test tile quantity changes within one file.
-TEST(VP9DecodeMultiThreadedTest, Decode3) {
+TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) {
   static const FileList files[] = {
     { "vp90-2-14-resize-fp-tiles-1-16.webm",
       "0cd5e632c326297e975f38949c31ea94" },
@@ -307,6 +306,19 @@ TEST(VP9DecodeMultiThreadedTest, Decode3) {
 
   DecodeFiles(files);
 }
+
+TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) {
+  static const FileList files[] = {
+    { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
+    { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
+    { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
+    { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
+    { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
+    { NULL, NULL }
+  };
+
+  DecodeFiles(files);
+}
 #endif  // CONFIG_WEBM_IO
 
 INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
diff --git a/libvpx/test/y4m_video_source.h b/libvpx/test/y4m_video_source.h
index 378e75bf..03d9388d 100644
--- a/libvpx/test/y4m_video_source.h
+++ b/libvpx/test/y4m_video_source.h
@@ -9,6 +9,7 @@
  */
 #ifndef TEST_Y4M_VIDEO_SOURCE_H_
 #define TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
 #include <string>
 
 #include "test/video_source.h"
@@ -91,6 +92,18 @@ class Y4mVideoSource : public VideoSource {
     y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
   }
 
+  // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mVideoSource is required
+  // and not just a vpx_image_t because of how the y4m reader manipulates
+  // vpx_image_t internals.
+  void SwapBuffers(Y4mVideoSource *other) {
+    std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+    vpx_image_t *tmp;
+    tmp = other->img_.release();
+    other->img_.reset(img_.release());
+    img_.reset(tmp);
+  }
+
  protected:
   void CloseSource() {
     y4m_input_close(&y4m_);
diff --git a/libvpx/third_party/libwebm/README.libvpx b/libvpx/third_party/libwebm/README.libvpx
index 91875e11..2989d3d8 100644
--- a/libvpx/third_party/libwebm/README.libvpx
+++ b/libvpx/third_party/libwebm/README.libvpx
@@ -1,7 +1,10 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 2dec09426ab62b794464cc9971bd135b4d313e65
+Version: 476366249e1fda7710a389cd41c57db42305e0d4
 License: BSD
 License File: LICENSE.txt
 
 Description:
 libwebm is used to handle WebM container I/O.
+
+Local Changes:
+* <none>
diff --git a/libvpx/third_party/libwebm/mkvmuxer.hpp b/libvpx/third_party/libwebm/mkvmuxer.hpp
index 497ad4cf..03a002c9 100644
--- a/libvpx/third_party/libwebm/mkvmuxer.hpp
+++ b/libvpx/third_party/libwebm/mkvmuxer.hpp
@@ -528,7 +528,7 @@ class Tracks {
  public:
   // Audio and video type defined by the Matroska specs.
   enum { kVideo = 0x1, kAudio = 0x2 };
-  // Opus, Vorbis, VP8, and VP9 codec ids defined by the Matroska specs.
+
   static const char kOpusCodecId[];
   static const char kVorbisCodecId[];
   static const char kVp8CodecId[];
diff --git a/libvpx/third_party/libwebm/mkvparser.cpp b/libvpx/third_party/libwebm/mkvparser.cpp
index fc01be52..f2855d50 100644
--- a/libvpx/third_party/libwebm/mkvparser.cpp
+++ b/libvpx/third_party/libwebm/mkvparser.cpp
@@ -7,45 +7,66 @@
 // be found in the AUTHORS file in the root of the source tree.
 
 #include "mkvparser.hpp"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h>  // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
 #include <cassert>
+#include <climits>
+#include <cmath>
 #include <cstring>
 #include <new>
-#include <climits>
+
+#include "webmids.hpp"
 
 #ifdef _MSC_VER
 // Disable MSVC warnings that suggest making code non-portable.
 #pragma warning(disable : 4996)
 #endif
 
-mkvparser::IMkvReader::~IMkvReader() {}
+namespace mkvparser {
+
+#ifdef MSC_COMPAT
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }
+#else
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif  // MSC_COMPAT
+
+IMkvReader::~IMkvReader() {}
 
-void mkvparser::GetVersion(int& major, int& minor, int& build, int& revision) {
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+                                             unsigned long long element_size) {
+  if (num_elements == 0 || element_size == 0)
+    return NULL;
+
+  const size_t kMaxAllocSize = 0x80000000;  // 2GiB
+  const unsigned long long num_bytes = num_elements * element_size;
+  if (element_size > (kMaxAllocSize / num_elements))
+    return NULL;
+  if (num_bytes != static_cast<size_t>(num_bytes))
+    return NULL;
+
+  return new (std::nothrow) Type[static_cast<size_t>(num_bytes)];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
   major = 1;
   minor = 0;
   build = 0;
   revision = 30;
 }
 
-long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
-  assert(pReader);
-  assert(pos >= 0);
-
-  int status;
-
-  //#ifdef _DEBUG
-  //    long long total, available;
-  //    status = pReader->Length(&total, &available);
-  //    assert(status >= 0);
-  //    assert((total < 0) || (available <= total));
-  //    assert(pos < available);
-  //    assert((available - pos) >= 1);  //assume here max u-int len is 8
-  //#endif
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
 
   len = 1;
-
   unsigned char b;
-
-  status = pReader->Read(pos, 1, &b);
+  int status = pReader->Read(pos, 1, &b);
 
   if (status < 0)  // error or underflow
     return status;
@@ -63,10 +84,6 @@ long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
     ++len;
   }
 
-  //#ifdef _DEBUG
-  //    assert((available - pos) >= len);
-  //#endif
-
   long long result = b & (~m);
   ++pos;
 
@@ -92,16 +109,76 @@ long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
   return result;
 }
 
-long long mkvparser::GetUIntLength(IMkvReader* pReader, long long pos,
-                                   long& len) {
-  assert(pReader);
-  assert(pos >= 0);
+// Reads an EBML ID and returns it.
+// An ID must be at least 1 byte long, cannot exceed 4 bytes, and its value
+// must be greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+  if (pReader == NULL || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Read the first byte. The length in bytes of the ID is determined by
+  // finding the first set bit in the first byte of the ID.
+  unsigned char temp_byte = 0;
+  int read_status = pReader->Read(pos, 1, &temp_byte);
+
+  if (read_status < 0)
+    return E_FILE_FORMAT_INVALID;
+  else if (read_status > 0)  // No data to read.
+    return E_BUFFER_NOT_FULL;
+
+  if (temp_byte == 0)  // ID length > 8 bytes; invalid file.
+    return E_FILE_FORMAT_INVALID;
+
+  int bit_pos = 0;
+  const int kMaxIdLengthInBytes = 4;
+  const int kCheckByte = 0x80;
+
+  // Find the first bit that's set.
+  bool found_bit = false;
+  for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+    if ((kCheckByte >> bit_pos) & temp_byte) {
+      found_bit = true;
+      break;
+    }
+  }
+
+  if (!found_bit) {
+    // The value is too large to be a valid ID.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Read the remaining bytes of the ID (if any).
+  const int id_length = bit_pos + 1;
+  long long ebml_id = temp_byte;
+  for (int i = 1; i < id_length; ++i) {
+    ebml_id <<= 8;
+    read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+    if (read_status < 0)
+      return E_FILE_FORMAT_INVALID;
+    else if (read_status > 0)
+      return E_BUFFER_NOT_FULL;
+
+    ebml_id |= temp_byte;
+  }
+
+  len = id_length;
+  return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
 
   long long total, available;
 
   int status = pReader->Length(&total, &available);
-  assert(status >= 0);
-  assert((total < 0) || (available <= total));
+  if (status < 0 || (total >= 0 && available > total))
+    return E_FILE_FORMAT_INVALID;
 
   len = 1;
 
@@ -112,11 +189,9 @@ long long mkvparser::GetUIntLength(IMkvReader* pReader, long long pos,
 
   status = pReader->Read(pos, 1, &b);
 
-  if (status < 0)
+  if (status != 0)
     return status;
 
-  assert(status == 0);
-
   if (b == 0)  // we can't handle u-int values larger than 8 bytes
     return E_FILE_FORMAT_INVALID;
 
@@ -132,12 +207,8 @@ long long mkvparser::GetUIntLength(IMkvReader* pReader, long long pos,
 
 // TODO(vigneshv): This function assumes that unsigned values never have their
 // high bit set.
-long long mkvparser::UnserializeUInt(IMkvReader* pReader, long long pos,
-                                     long long size) {
-  assert(pReader);
-  assert(pos >= 0);
-
-  if ((size <= 0) || (size > 8))
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+  if (!pReader || pos < 0 || (size <= 0) || (size > 8))
     return E_FILE_FORMAT_INVALID;
 
   long long result = 0;
@@ -159,12 +230,9 @@ long long mkvparser::UnserializeUInt(IMkvReader* pReader, long long pos,
   return result;
 }
 
-long mkvparser::UnserializeFloat(IMkvReader* pReader, long long pos,
-                                 long long size_, double& result) {
-  assert(pReader);
-  assert(pos >= 0);
-
-  if ((size_ != 4) && (size_ != 8))
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+                      double& result) {
+  if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
     return E_FILE_FORMAT_INVALID;
 
   const long size = static_cast<long>(size_);
@@ -195,8 +263,6 @@ long mkvparser::UnserializeFloat(IMkvReader* pReader, long long pos,
 
     result = f;
   } else {
-    assert(size == 8);
-
     union {
       double d;
       unsigned long long dd;
@@ -216,28 +282,25 @@ long mkvparser::UnserializeFloat(IMkvReader* pReader, long long pos,
     result = d;
   }
 
+  if (mkvparser::isinf(result) || mkvparser::isnan(result))
+    return E_FILE_FORMAT_INVALID;
+
   return 0;
 }
 
-long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos,
-                               long long size, long long& result) {
-  assert(pReader);
-  assert(pos >= 0);
-  assert(size > 0);
-  assert(size <= 8);
-
-  {
-    signed char b;
-
-    const long status = pReader->Read(pos, 1, (unsigned char*)&b);
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+                    long long& result_ref) {
+  if (!pReader || pos < 0 || size < 1 || size > 8)
+    return E_FILE_FORMAT_INVALID;
 
-    if (status < 0)
-      return status;
+  signed char first_byte = 0;
+  const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
 
-    result = b;
+  if (status < 0)
+    return status;
 
-    ++pos;
-  }
+  unsigned long long result = first_byte;
+  ++pos;
 
   for (long i = 1; i < size; ++i) {
     unsigned char b;
@@ -253,27 +316,28 @@ long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos,
     ++pos;
   }
 
-  return 0;  // success
+  result_ref = static_cast<long long>(result);
+  return 0;
 }
 
-long mkvparser::UnserializeString(IMkvReader* pReader, long long pos,
-                                  long long size_, char*& str) {
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+                       char*& str) {
   delete[] str;
   str = NULL;
 
-  if (size_ >= LONG_MAX)  // we need (size+1) chars
+  if (size >= LONG_MAX || size < 0)
     return E_FILE_FORMAT_INVALID;
 
-  const long size = static_cast<long>(size_);
-
-  str = new (std::nothrow) char[size + 1];
+  // +1 for '\0' terminator
+  const long required_size = static_cast<long>(size) + 1;
 
+  str = SafeArrayAlloc<char>(1, required_size);
   if (str == NULL)
-    return -1;
+    return E_FILE_FORMAT_INVALID;
 
   unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
 
-  const long status = pReader->Read(pos, size, buf);
+  const long status = pReader->Read(pos, static_cast<long>(size), buf);
 
   if (status) {
     delete[] str;
@@ -282,137 +346,149 @@ long mkvparser::UnserializeString(IMkvReader* pReader, long long pos,
     return status;
   }
 
-  str[size] = '\0';
-
-  return 0;  // success
+  str[required_size - 1] = '\0';
+  return 0;
 }
 
-long mkvparser::ParseElementHeader(IMkvReader* pReader, long long& pos,
-                                   long long stop, long long& id,
-                                   long long& size) {
-  if ((stop >= 0) && (pos >= stop))
+long ParseElementHeader(IMkvReader* pReader, long long& pos,
+                        long long stop, long long& id,
+                        long long& size) {
+  if (stop >= 0 && pos >= stop)
     return E_FILE_FORMAT_INVALID;
 
   long len;
 
-  id = ReadUInt(pReader, pos, len);
+  id = ReadID(pReader, pos, len);
 
   if (id < 0)
     return E_FILE_FORMAT_INVALID;
 
   pos += len;  // consume id
 
-  if ((stop >= 0) && (pos >= stop))
+  if (stop >= 0 && pos >= stop)
     return E_FILE_FORMAT_INVALID;
 
   size = ReadUInt(pReader, pos, len);
 
-  if (size < 0)
+  if (size < 0 || len < 1 || len > 8) {
+    // Invalid: Negative payload size, negative or 0 length integer, or integer
+    // larger than 64 bits (libwebm cannot handle them).
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Avoid rolling over pos when very close to LLONG_MAX.
+  const unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
     return E_FILE_FORMAT_INVALID;
 
   pos += len;  // consume length of size
 
   // pos now designates payload
 
-  if ((stop >= 0) && ((pos + size) > stop))
+  if (stop >= 0 && pos >= stop)
     return E_FILE_FORMAT_INVALID;
 
   return 0;  // success
 }
 
-bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
-                      long long& val) {
-  assert(pReader);
-  assert(pos >= 0);
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           long long& val) {
+  if (!pReader || pos < 0)
+    return false;
 
-  long long total, available;
+  long long total = 0;
+  long long available = 0;
 
   const long status = pReader->Length(&total, &available);
-  assert(status >= 0);
-  assert((total < 0) || (available <= total));
-  if (status < 0)
+  if (status < 0 || (total >= 0 && available > total))
     return false;
 
-  long len;
+  long len = 0;
 
-  const long long id = ReadUInt(pReader, pos, len);
-  assert(id >= 0);
-  assert(len > 0);
-  assert(len <= 8);
-  assert((pos + len) <= available);
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
 
-  if ((unsigned long)id != id_)
+  if (static_cast<unsigned long>(id) != expected_id)
     return false;
 
   pos += len;  // consume id
 
   const long long size = ReadUInt(pReader, pos, len);
-  assert(size >= 0);
-  assert(size <= 8);
-  assert(len > 0);
-  assert(len <= 8);
-  assert((pos + len) <= available);
+  if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+    return false;
 
   pos += len;  // consume length of size of payload
 
   val = UnserializeUInt(pReader, pos, size);
-  assert(val >= 0);
+  if (val < 0)
+    return false;
 
   pos += size;  // consume size of payload
 
   return true;
 }
 
-bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
-                      unsigned char*& buf, size_t& buflen) {
-  assert(pReader);
-  assert(pos >= 0);
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           unsigned char*& buf, size_t& buflen) {
+  if (!pReader || pos < 0)
+    return false;
 
-  long long total, available;
+  long long total = 0;
+  long long available = 0;
 
   long status = pReader->Length(&total, &available);
-  assert(status >= 0);
-  assert((total < 0) || (available <= total));
-  if (status < 0)
+  if (status < 0 || (total >= 0 && available > total))
     return false;
 
-  long len;
-  const long long id = ReadUInt(pReader, pos, len);
-  assert(id >= 0);
-  assert(len > 0);
-  assert(len <= 8);
-  assert((pos + len) <= available);
+  long len = 0;
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
 
-  if ((unsigned long)id != id_)
+  if (static_cast<unsigned long>(id) != expected_id)
     return false;
 
   pos += len;  // consume id
 
-  const long long size_ = ReadUInt(pReader, pos, len);
-  assert(size_ >= 0);
-  assert(len > 0);
-  assert(len <= 8);
-  assert((pos + len) <= available);
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+    return false;
+
+  unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return false;
 
   pos += len;  // consume length of size of payload
-  assert((pos + size_) <= available);
 
-  const long buflen_ = static_cast<long>(size_);
+  rollover_check = static_cast<unsigned long long>(pos) + size;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  if ((pos + size) > available)
+    return false;
+
+  if (size >= LONG_MAX)
+    return false;
+
+  const long buflen_ = static_cast<long>(size);
 
-  buf = new (std::nothrow) unsigned char[buflen_];
-  assert(buf);  // TODO
+  buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+  if (!buf)
+    return false;
 
   status = pReader->Read(pos, buflen_, buf);
-  assert(status == 0);  // TODO
+  if (status != 0)
+    return false;
 
   buflen = buflen_;
 
-  pos += size_;  // consume size of payload
+  pos += size;  // consume size of payload
   return true;
 }
 
-namespace mkvparser {
-
 EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
 
 EBMLHeader::~EBMLHeader() { delete[] m_docType; }
@@ -433,7 +509,8 @@ void EBMLHeader::Init() {
 }
 
 long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
-  assert(pReader);
+  if (!pReader)
+    return E_FILE_FORMAT_INVALID;
 
   long long total, available;
 
@@ -445,67 +522,45 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
   pos = 0;
   long long end = (available >= 1024) ? 1024 : available;
 
-  for (;;) {
-    unsigned char b = 0;
-
-    while (pos < end) {
-      status = pReader->Read(pos, 1, &b);
-
-      if (status < 0)  // error
-        return status;
-
-      if (b == 0x1A)
-        break;
-
-      ++pos;
-    }
+  // Scan until we find what looks like the first byte of the EBML header.
+  const long long kMaxScanBytes = (available >= 1024) ? 1024 : available;
+  const unsigned char kEbmlByte0 = 0x1A;
+  unsigned char scan_byte = 0;
 
-    if (b != 0x1A) {
-      if (pos >= 1024)
-        return E_FILE_FORMAT_INVALID;  // don't bother looking anymore
+  while (pos < kMaxScanBytes) {
+    status = pReader->Read(pos, 1, &scan_byte);
 
-      if ((total >= 0) && ((total - available) < 5))
-        return E_FILE_FORMAT_INVALID;
-
-      return available + 5;  // 5 = 4-byte ID + 1st byte of size
-    }
-
-    if ((total >= 0) && ((total - pos) < 5))
-      return E_FILE_FORMAT_INVALID;
-
-    if ((available - pos) < 5)
-      return pos + 5;  // try again later
-
-    long len;
-
-    const long long result = ReadUInt(pReader, pos, len);
-
-    if (result < 0)  // error
-      return result;
+    if (status < 0)  // error
+      return status;
+    else if (status > 0)
+      return E_BUFFER_NOT_FULL;
 
-    if (result == 0x0A45DFA3) {  // EBML Header ID
-      pos += len;  // consume ID
+    if (scan_byte == kEbmlByte0)
       break;
-    }
 
-    ++pos;  // throw away just the 0x1A byte, and try again
+    ++pos;
   }
 
-  // pos designates start of size field
+  long len = 0;
+  const long long ebml_id = ReadID(pReader, pos, len);
 
-  // get length of size field
+  // TODO(tomfinegan): Move Matroska ID constants into a common namespace.
+  if (len != 4 || ebml_id != mkvmuxer::kMkvEBML)
+    return E_FILE_FORMAT_INVALID;
 
-  long len;
+  // Move read pos forward to the EBML header size field.
+  pos += 4;
+
+  // Read length of size field.
   long long result = GetUIntLength(pReader, pos, len);
 
   if (result < 0)  // error
-    return result;
-
-  if (result > 0)  // need more data
-    return result;
+    return E_FILE_FORMAT_INVALID;
+  else if (result > 0)  // need more data
+    return E_BUFFER_NOT_FULL;
 
-  assert(len > 0);
-  assert(len <= 8);
+  if (len < 1 || len > 8)
+    return E_FILE_FORMAT_INVALID;
 
   if ((total >= 0) && ((total - pos) < len))
     return E_FILE_FORMAT_INVALID;
@@ -513,8 +568,7 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
   if ((available - pos) < len)
     return pos + len;  // try again later
 
-  // get the EBML header size
-
+  // Read the EBML header size.
   result = ReadUInt(pReader, pos, len);
 
   if (result < 0)  // error
@@ -542,30 +596,30 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
     if (status < 0)  // error
       return status;
 
-    if (size == 0)  // weird
+    if (size == 0)
       return E_FILE_FORMAT_INVALID;
 
-    if (id == 0x0286) {  // version
+    if (id == mkvmuxer::kMkvEBMLVersion) {
       m_version = UnserializeUInt(pReader, pos, size);
 
       if (m_version <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x02F7) {  // read version
+    } else if (id == mkvmuxer::kMkvEBMLReadVersion) {
       m_readVersion = UnserializeUInt(pReader, pos, size);
 
       if (m_readVersion <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x02F2) {  // max id length
+    } else if (id == mkvmuxer::kMkvEBMLMaxIDLength) {
       m_maxIdLength = UnserializeUInt(pReader, pos, size);
 
       if (m_maxIdLength <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x02F3) {  // max size length
+    } else if (id == mkvmuxer::kMkvEBMLMaxSizeLength) {
       m_maxSizeLength = UnserializeUInt(pReader, pos, size);
 
       if (m_maxSizeLength <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x0282) {  // doctype
+    } else if (id == mkvmuxer::kMkvDocType) {
       if (m_docType)
         return E_FILE_FORMAT_INVALID;
 
@@ -573,12 +627,12 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
 
       if (status)  // error
         return status;
-    } else if (id == 0x0287) {  // doctype version
+    } else if (id == mkvmuxer::kMkvDocTypeVersion) {
       m_docTypeVersion = UnserializeUInt(pReader, pos, size);
 
       if (m_docTypeVersion <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x0285) {  // doctype read version
+    } else if (id == mkvmuxer::kMkvDocTypeReadVersion) {
       m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
 
       if (m_docTypeReadVersion <= 0)
@@ -588,7 +642,18 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
     pos += size;
   }
 
-  assert(pos == end);
+  if (pos != end)
+    return E_FILE_FORMAT_INVALID;
+
+  // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid.
+  if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+  if (m_maxIdLength <= 0 || m_maxIdLength > 4 ||
+      m_maxSizeLength <= 0 || m_maxSizeLength > 8)
+    return E_FILE_FORMAT_INVALID;
+
   return 0;
 }
 
@@ -621,8 +686,6 @@ Segment::~Segment() {
 
   while (i != j) {
     Cluster* const p = *i++;
-    assert(p);
-
     delete p;
   }
 
@@ -638,8 +701,8 @@ Segment::~Segment() {
 
 long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
                                   Segment*& pSegment) {
-  assert(pReader);
-  assert(pos >= 0);
+  if (pReader == NULL || pos < 0)
+    return E_PARSE_FAILED;
 
   pSegment = NULL;
 
@@ -691,10 +754,10 @@ long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
       return pos + len;
 
     const long long idpos = pos;
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
 
-    if (id < 0)  // error
-      return id;
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
 
     pos += len;  // consume ID
 
@@ -723,7 +786,7 @@ long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
     // Handle "unknown size" for live streaming of webm files.
     const long long unknown_size = (1LL << (7 * len)) - 1;
 
-    if (id == 0x08538067) {  // Segment ID
+    if (id == mkvmuxer::kMkvSegment) {
       if (size == unknown_size)
         size = -1;
 
@@ -733,12 +796,9 @@ long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
       else if ((pos + size) > total)
         size = -1;
 
-      pSegment = new (std::nothrow) Segment(pReader, idpos,
-                                            // elem_size
-                                            pos, size);
-
-      if (pSegment == 0)
-        return -1;  // generic error
+      pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+      if (pSegment == NULL)
+        return E_PARSE_FAILED;
 
       return 0;  // success
     }
@@ -767,11 +827,15 @@ long long Segment::ParseHeaders() {
   if (status < 0)  // error
     return status;
 
-  assert((total < 0) || (available <= total));
+  if (total >= 0 && available > total)
+    return E_FILE_FORMAT_INVALID;
 
   const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
-  assert((segment_stop < 0) || (total < 0) || (segment_stop <= total));
-  assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+  if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+      (segment_stop >= 0 && m_pos > segment_stop)) {
+    return E_FILE_FORMAT_INVALID;
+  }
 
   for (;;) {
     if ((total >= 0) && (m_pos >= total))
@@ -783,6 +847,11 @@ long long Segment::ParseHeaders() {
     long long pos = m_pos;
     const long long element_start = pos;
 
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    unsigned long long rollover_check = pos + 1ULL;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
     if ((pos + 1) > available)
       return (pos + 1);
 
@@ -792,8 +861,10 @@ long long Segment::ParseHeaders() {
     if (result < 0)  // error
       return result;
 
-    if (result > 0)  // underflow (weird)
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
       return (pos + 1);
+    }
 
     if ((segment_stop >= 0) && ((pos + len) > segment_stop))
       return E_FILE_FORMAT_INVALID;
@@ -802,12 +873,12 @@ long long Segment::ParseHeaders() {
       return pos + len;
 
     const long long idpos = pos;
-    const long long id = ReadUInt(m_pReader, idpos, len);
+    const long long id = ReadID(m_pReader, idpos, len);
 
-    if (id < 0)  // error
-      return id;
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
 
-    if (id == 0x0F43B675)  // Cluster ID
+    if (id == mkvmuxer::kMkvCluster)
       break;
 
     pos += len;  // consume ID
@@ -821,8 +892,10 @@ long long Segment::ParseHeaders() {
     if (result < 0)  // error
       return result;
 
-    if (result > 0)  // underflow (weird)
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
       return (pos + 1);
+    }
 
     if ((segment_stop >= 0) && ((pos + len) > segment_stop))
       return E_FILE_FORMAT_INVALID;
@@ -832,11 +905,19 @@ long long Segment::ParseHeaders() {
 
     const long long size = ReadUInt(m_pReader, pos, len);
 
-    if (size < 0)  // error
+    if (size < 0 || len < 1 || len > 8) {
+      // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+      // len > 8 is true instead of checking this _everywhere_.
       return size;
+    }
 
     pos += len;  // consume length of size of element
 
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    rollover_check = static_cast<unsigned long long>(pos) + size;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
     const long long element_size = size + pos - element_start;
 
     // Pos now points to start of payload
@@ -849,7 +930,7 @@ long long Segment::ParseHeaders() {
     if ((pos + size) > available)
       return pos + size;
 
-    if (id == 0x0549A966) {  // Segment Info ID
+    if (id == mkvmuxer::kMkvInfo) {
       if (m_pInfo)
         return E_FILE_FORMAT_INVALID;
 
@@ -863,7 +944,7 @@ long long Segment::ParseHeaders() {
 
       if (status)
         return status;
-    } else if (id == 0x0654AE6B) {  // Tracks ID
+    } else if (id == mkvmuxer::kMkvTracks) {
       if (m_pTracks)
         return E_FILE_FORMAT_INVALID;
 
@@ -877,7 +958,7 @@ long long Segment::ParseHeaders() {
 
       if (status)
         return status;
-    } else if (id == 0x0C53BB6B) {  // Cues ID
+    } else if (id == mkvmuxer::kMkvCues) {
       if (m_pCues == NULL) {
         m_pCues = new (std::nothrow)
             Cues(this, pos, size, element_start, element_size);
@@ -885,7 +966,7 @@ long long Segment::ParseHeaders() {
         if (m_pCues == NULL)
           return -1;
       }
-    } else if (id == 0x014D9B74) {  // SeekHead ID
+    } else if (id == mkvmuxer::kMkvSeekHead) {
       if (m_pSeekHead == NULL) {
         m_pSeekHead = new (std::nothrow)
             SeekHead(this, pos, size, element_start, element_size);
@@ -898,7 +979,7 @@ long long Segment::ParseHeaders() {
         if (status)
           return status;
       }
-    } else if (id == 0x0043A770) {  // Chapters ID
+    } else if (id == mkvmuxer::kMkvChapters) {
       if (m_pChapters == NULL) {
         m_pChapters = new (std::nothrow)
             Chapters(this, pos, size, element_start, element_size);
@@ -911,7 +992,7 @@ long long Segment::ParseHeaders() {
         if (status)
           return status;
       }
-    } else if (id == 0x0254C367) {  // Tags ID
+    } else if (id == mkvmuxer::kMkvTags) {
       if (m_pTags == NULL) {
         m_pTags = new (std::nothrow)
             Tags(this, pos, size, element_start, element_size);
@@ -929,7 +1010,8 @@ long long Segment::ParseHeaders() {
     m_pos = pos + size;  // consume payload
   }
 
-  assert((segment_stop < 0) || (m_pos <= segment_stop));
+  if (segment_stop >= 0 && m_pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
 
   if (m_pInfo == NULL)  // TODO: liberalize this behavior
     return E_FILE_FORMAT_INVALID;
@@ -960,7 +1042,8 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
   if (status < 0)  // error
     return status;
 
-  assert((total < 0) || (avail <= total));
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
 
   const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
 
@@ -988,7 +1071,7 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
     if (result < 0)  // error
       return static_cast<long>(result);
 
-    if (result > 0)  // weird
+    if (result > 0)
       return E_BUFFER_NOT_FULL;
 
     if ((segment_stop >= 0) && ((pos + len) > segment_stop))
@@ -998,10 +1081,10 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
       return E_BUFFER_NOT_FULL;
 
     const long long idpos = pos;
-    const long long id = ReadUInt(m_pReader, idpos, len);
+    const long long id = ReadID(m_pReader, idpos, len);
 
-    if (id < 0)  // error (or underflow)
-      return static_cast<long>(id);
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
 
     pos += len;  // consume ID
 
@@ -1017,7 +1100,7 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
     if (result < 0)  // error
       return static_cast<long>(result);
 
-    if (result > 0)  // weird
+    if (result > 0)
       return E_BUFFER_NOT_FULL;
 
     if ((segment_stop >= 0) && ((pos + len) > segment_stop))
@@ -1035,7 +1118,8 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
 
     // pos now points to start of payload
 
-    if (size == 0) {  // weird
+    if (size == 0) {
+      // Missing element payload: move on.
       m_pos = pos;
       continue;
     }
@@ -1047,24 +1131,30 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
       return E_FILE_FORMAT_INVALID;
     }
 
-    if (id == 0x0C53BB6B) {  // Cues ID
-      if (size == unknown_size)
-        return E_FILE_FORMAT_INVALID;  // TODO: liberalize
+    if (id == mkvmuxer::kMkvCues) {
+      if (size == unknown_size) {
+        // Cues element of unknown size: Not supported.
+        return E_FILE_FORMAT_INVALID;
+      }
 
       if (m_pCues == NULL) {
         const long long element_size = (pos - idpos) + size;
 
-        m_pCues = new Cues(this, pos, size, idpos, element_size);
-        assert(m_pCues);  // TODO
+        m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+        if (m_pCues == NULL)
+          return -1;
       }
 
       m_pos = pos + size;  // consume payload
       continue;
     }
 
-    if (id != 0x0F43B675) {  // Cluster ID
+    if (id != mkvmuxer::kMkvCluster) {
+      // Besides the Segment, Libwebm allows only cluster elements of unknown
+      // size. Fail the parse upon encountering a non-cluster element reporting
+      // unknown size.
       if (size == unknown_size)
-        return E_FILE_FORMAT_INVALID;  // TODO: liberalize
+        return E_FILE_FORMAT_INVALID;
 
       m_pos = pos + size;  // consume payload
       continue;
@@ -1080,7 +1170,10 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
     break;
   }
 
-  assert(cluster_off >= 0);  // have cluster
+  if (cluster_off < 0) {
+    // No cluster, die.
+    return E_FILE_FORMAT_INVALID;
+  }
 
   long long pos_;
   long len_;
@@ -1126,14 +1219,16 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
   const long idx = m_clusterCount;
 
   if (m_clusterPreloadCount > 0) {
-    assert(idx < m_clusterSize);
+    if (idx >= m_clusterSize)
+      return E_FILE_FORMAT_INVALID;
 
     Cluster* const pCluster = m_clusters[idx];
-    assert(pCluster);
-    assert(pCluster->m_index < 0);
+    if (pCluster == NULL || pCluster->m_index >= 0)
+      return E_FILE_FORMAT_INVALID;
 
     const long long off = pCluster->GetPosition();
-    assert(off >= 0);
+    if (off < 0)
+      return E_FILE_FORMAT_INVALID;
 
     if (off == cluster_off) {  // preloaded already
       if (status == 0)  // no entries found
@@ -1155,7 +1250,8 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
       --m_clusterPreloadCount;
 
       m_pos = pos;  // consume payload
-      assert((segment_stop < 0) || (m_pos <= segment_stop));
+      if (segment_stop >= 0 && m_pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
 
       return 0;  // success
     }
@@ -1182,19 +1278,21 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
   // status > 0 means we have an entry
 
   Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
-  // element_size);
-  assert(pCluster);
+  if (pCluster == NULL)
+    return -1;
 
-  AppendCluster(pCluster);
-  assert(m_clusters);
-  assert(idx < m_clusterSize);
-  assert(m_clusters[idx] == pCluster);
+  if (!AppendCluster(pCluster)) {
+    delete pCluster;
+    return -1;
+  }
 
   if (cluster_size >= 0) {
     pos += cluster_size;
 
     m_pos = pos;
-    assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+    if (segment_stop >= 0 && m_pos > segment_stop)
+      return E_FILE_FORMAT_INVALID;
 
     return 0;
   }
@@ -1210,8 +1308,8 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
 }
 
 long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
-  assert(m_pos < 0);
-  assert(m_pUnknownSize);
+  if (m_pos >= 0 || m_pUnknownSize == NULL)
+    return E_PARSE_FAILED;
 
   const long status = m_pUnknownSize->Parse(pos, len);
 
@@ -1221,12 +1319,11 @@ long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
   if (status == 0)  // parsed a block
     return 2;  // continue parsing
 
-  assert(status > 0);  // nothing left to parse of this cluster
-
   const long long start = m_pUnknownSize->m_element_start;
-
   const long long size = m_pUnknownSize->GetElementSize();
-  assert(size >= 0);
+
+  if (size < 0)
+    return E_FILE_FORMAT_INVALID;
 
   pos = start + size;
   m_pos = pos;
@@ -1236,24 +1333,26 @@ long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
   return 2;  // continue parsing
 }
 
-void Segment::AppendCluster(Cluster* pCluster) {
-  assert(pCluster);
-  assert(pCluster->m_index >= 0);
+bool Segment::AppendCluster(Cluster* pCluster) {
+  if (pCluster == NULL || pCluster->m_index < 0)
+    return false;
 
   const long count = m_clusterCount + m_clusterPreloadCount;
 
   long& size = m_clusterSize;
-  assert(size >= count);
-
   const long idx = pCluster->m_index;
-  assert(idx == m_clusterCount);
+
+  if (size < count || idx != m_clusterCount)
+    return false;
 
   if (count >= size) {
     const long n = (size <= 0) ? 2048 : 2 * size;
 
-    Cluster** const qq = new Cluster*[n];
-    Cluster** q = qq;
+    Cluster** const qq = new (std::nothrow) Cluster*[n];
+    if (qq == NULL)
+      return false;
 
+    Cluster** q = qq;
     Cluster** p = m_clusters;
     Cluster** const pp = p + count;
 
@@ -1267,18 +1366,18 @@ void Segment::AppendCluster(Cluster* pCluster) {
   }
 
   if (m_clusterPreloadCount > 0) {
-    assert(m_clusters);
-
     Cluster** const p = m_clusters + m_clusterCount;
-    assert(*p);
-    assert((*p)->m_index < 0);
+    if (*p == NULL || (*p)->m_index >= 0)
+      return false;
 
     Cluster** q = p + m_clusterPreloadCount;
-    assert(q < (m_clusters + size));
+    if (q >= (m_clusters + size))
+      return false;
 
     for (;;) {
       Cluster** const qq = q - 1;
-      assert((*qq)->m_index < 0);
+      if ((*qq)->m_index >= 0)
+        return false;
 
       *q = *qq;
       q = qq;
@@ -1290,22 +1389,25 @@ void Segment::AppendCluster(Cluster* pCluster) {
 
   m_clusters[idx] = pCluster;
   ++m_clusterCount;
+  return true;
 }
 
-void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
-  assert(pCluster);
-  assert(pCluster->m_index < 0);
-  assert(idx >= m_clusterCount);
+bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+  if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount)
+    return false;
 
   const long count = m_clusterCount + m_clusterPreloadCount;
 
   long& size = m_clusterSize;
-  assert(size >= count);
+  if (size < count)
+    return false;
 
   if (count >= size) {
     const long n = (size <= 0) ? 2048 : 2 * size;
 
-    Cluster** const qq = new Cluster*[n];
+    Cluster** const qq = new (std::nothrow) Cluster*[n];
+    if (qq == NULL)
+      return false;
     Cluster** q = qq;
 
     Cluster** p = m_clusters;
@@ -1320,17 +1422,20 @@ void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
     size = n;
   }
 
-  assert(m_clusters);
+  if (m_clusters == NULL)
+    return false;
 
   Cluster** const p = m_clusters + idx;
 
   Cluster** q = m_clusters + count;
-  assert(q >= p);
-  assert(q < (m_clusters + size));
+  if (q < p || q >= (m_clusters + size))
+    return false;
 
   while (q > p) {
     Cluster** const qq = q - 1;
-    assert((*qq)->m_index < 0);
+
+    if ((*qq)->m_index >= 0)
+      return false;
 
     *q = *qq;
     q = qq;
@@ -1338,13 +1443,12 @@ void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
 
   m_clusters[idx] = pCluster;
   ++m_clusterPreloadCount;
+  return true;
 }
 
 long Segment::Load() {
-  assert(m_clusters == NULL);
-  assert(m_clusterSize == 0);
-  assert(m_clusterCount == 0);
-  // assert(m_size >= 0);
+  if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0)
+    return E_PARSE_FAILED;
 
   // Outermost (level 0) segment object has been constructed,
   // and pos designates start of payload.  We need to find the
@@ -1358,8 +1462,8 @@ long Segment::Load() {
   if (header_status > 0)  // underflow
     return E_BUFFER_NOT_FULL;
 
-  assert(m_pInfo);
-  assert(m_pTracks);
+  if (m_pInfo == NULL || m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
 
   for (;;) {
     const int status = LoadCluster();
@@ -1408,16 +1512,19 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == 0x0DBB)  // SeekEntry ID
+    if (id == mkvmuxer::kMkvSeek)
       ++entry_count;
-    else if (id == 0x6C)  // Void ID
+    else if (id == mkvmuxer::kMkvVoid)
       ++void_element_count;
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   m_entries = new (std::nothrow) Entry[entry_count];
 
@@ -1446,14 +1553,14 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == 0x0DBB) {  // SeekEntry ID
+    if (id == mkvmuxer::kMkvSeek) {
       if (ParseEntry(pReader, pos, size, pEntry)) {
         Entry& e = *pEntry++;
 
         e.element_start = idpos;
         e.element_size = (pos + size) - idpos;
       }
-    } else if (id == 0x6C) {  // Void ID
+    } else if (id == mkvmuxer::kMkvVoid) {
       VoidElement& e = *pVoidElement++;
 
       e.element_start = idpos;
@@ -1461,10 +1568,12 @@ long SeekHead::Parse() {
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
   assert(count_ >= 0);
@@ -1553,9 +1662,9 @@ long Segment::ParseCues(long long off, long long& pos, long& len) {
 
   const long long idpos = pos;
 
-  const long long id = ReadUInt(m_pReader, idpos, len);
+  const long long id = ReadID(m_pReader, idpos, len);
 
-  if (id != 0x0C53BB6B)  // Cues ID
+  if (id != mkvmuxer::kMkvCues)
     return E_FILE_FORMAT_INVALID;
 
   pos += len;  // consume ID
@@ -1615,7 +1724,8 @@ long Segment::ParseCues(long long off, long long& pos, long& len) {
 
   m_pCues =
       new (std::nothrow) Cues(this, pos, size, element_start, element_size);
-  assert(m_pCues);  // TODO
+  if (m_pCues == NULL)
+    return -1;
 
   return 0;  // success
 }
@@ -1632,10 +1742,11 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
 
   // parse the container for the level-1 element ID
 
-  const long long seekIdId = ReadUInt(pReader, pos, len);
-  // seekIdId;
+  const long long seekIdId = ReadID(pReader, pos, len);
+  if (seekIdId < 0)
+    return false;
 
-  if (seekIdId != 0x13AB)  // SeekID ID
+  if (seekIdId != mkvmuxer::kMkvSeekID)
     return false;
 
   if ((pos + len) > stop)
@@ -1677,9 +1788,9 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
 
   pos += seekIdSize;  // consume SeekID payload
 
-  const long long seekPosId = ReadUInt(pReader, pos, len);
+  const long long seekPosId = ReadID(pReader, pos, len);
 
-  if (seekPosId != 0x13AC)  // SeekPos ID
+  if (seekPosId != mkvmuxer::kMkvSeekPosition)
     return false;
 
   if ((pos + len) > stop)
@@ -1757,8 +1868,8 @@ bool Cues::Init() const {
   if (m_cue_points)
     return true;
 
-  assert(m_count == 0);
-  assert(m_preload_count == 0);
+  if (m_count != 0 || m_preload_count != 0)
+    return false;
 
   IMkvReader* const pReader = m_pSegment->m_pReader;
 
@@ -1772,7 +1883,7 @@ bool Cues::Init() const {
 
     long len;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
     if (id < 0 || (pos + len) > stop) {
       return false;
     }
@@ -1789,21 +1900,27 @@ bool Cues::Init() const {
       return false;
     }
 
-    if (id == 0x3B)  // CuePoint ID
-      PreloadCuePoint(cue_points_size, idpos);
+    if (id == mkvmuxer::kMkvCuePoint) {
+      if (!PreloadCuePoint(cue_points_size, idpos))
+        return false;
+    }
 
     pos += size;  // skip payload
   }
   return true;
 }
 
-void Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
-  assert(m_count == 0);
+bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+  if (m_count != 0)
+    return false;
 
   if (m_preload_count >= cue_points_size) {
     const long n = (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
 
-    CuePoint** const qq = new CuePoint*[n];
+    CuePoint** const qq = new (std::nothrow) CuePoint*[n];
+    if (qq == NULL)
+      return false;
+
     CuePoint** q = qq;  // beginning of target
 
     CuePoint** p = m_cue_points;  // beginning of source
@@ -1818,14 +1935,15 @@ void Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
     cue_points_size = n;
   }
 
-  CuePoint* const pCP = new CuePoint(m_preload_count, pos);
+  CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos);
+  if (pCP == NULL)
+    return false;
+
   m_cue_points[m_preload_count++] = pCP;
+  return true;
 }
 
 bool Cues::LoadCuePoint() const {
-  // odbgstream os;
-  // os << "Cues::LoadCuePoint" << endl;
-
   const long long stop = m_start + m_size;
 
   if (m_pos >= stop)
@@ -1843,32 +1961,33 @@ bool Cues::LoadCuePoint() const {
 
     long len;
 
-    const long long id = ReadUInt(pReader, m_pos, len);
-    assert(id >= 0);  // TODO
-    assert((m_pos + len) <= stop);
+    const long long id = ReadID(pReader, m_pos, len);
+    if (id < 0 || (m_pos + len) > stop)
+      return false;
 
     m_pos += len;  // consume ID
 
     const long long size = ReadUInt(pReader, m_pos, len);
-    assert(size >= 0);
-    assert((m_pos + len) <= stop);
+    if (size < 0 || (m_pos + len) > stop)
+      return false;
 
     m_pos += len;  // consume Size field
-    assert((m_pos + size) <= stop);
+    if ((m_pos + size) > stop)
+      return false;
 
-    if (id != 0x3B) {  // CuePoint ID
+    if (id != mkvmuxer::kMkvCuePoint) {
       m_pos += size;  // consume payload
-      assert(m_pos <= stop);
+      if (m_pos > stop)
+        return false;
 
       continue;
     }
 
-    assert(m_preload_count > 0);
+    if (m_preload_count < 1)
+      return false;
 
     CuePoint* const pCP = m_cue_points[m_count];
-    assert(pCP);
-    assert((pCP->GetTimeCode() >= 0) || (-pCP->GetTimeCode() == idpos));
-    if (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))
+    if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos)))
       return false;
 
     if (!pCP->Load(pReader)) {
@@ -1879,24 +1998,18 @@ bool Cues::LoadCuePoint() const {
     --m_preload_count;
 
     m_pos += size;  // consume payload
-    assert(m_pos <= stop);
+    if (m_pos > stop)
+      return false;
 
     return true;  // yes, we loaded a cue point
   }
 
-  // return (m_pos < stop);
   return false;  // no, we did not load a cue point
 }
 
 bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
                 const CuePoint::TrackPosition*& pTP) const {
-  assert(time_ns >= 0);
-  assert(pTrack);
-
-  if (m_cue_points == NULL)
-    return false;
-
-  if (m_count == 0)
+  if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0)
     return false;
 
   CuePoint** const ii = m_cue_points;
@@ -1906,7 +2019,8 @@ bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
   CuePoint** j = jj;
 
   pCP = *i;
-  assert(pCP);
+  if (pCP == NULL)
+    return false;
 
   if (time_ns <= pCP->GetTime(m_pSegment)) {
     pTP = pCP->Find(pTrack);
@@ -1920,10 +2034,12 @@ bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
     //[j, jj) > time_ns
 
     CuePoint** const k = i + (j - i) / 2;
-    assert(k < jj);
+    if (k >= jj)
+      return false;
 
     CuePoint* const pCP = *k;
-    assert(pCP);
+    if (pCP == NULL)
+      return false;
 
     const long long t = pCP->GetTime(m_pSegment);
 
@@ -1932,16 +2048,17 @@ bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
     else
       j = k;
 
-    assert(i <= j);
+    if (i > j)
+      return false;
   }
 
-  assert(i == j);
-  assert(i <= jj);
-  assert(i > ii);
+  if (i != j || i > jj || i <= ii)
+    return false;
 
   pCP = *--i;
-  assert(pCP);
-  assert(pCP->GetTime(m_pSegment) <= time_ns);
+
+  if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns)
+    return false;
 
   // TODO: here and elsewhere, it's probably not correct to search
   // for the cue point with this time, and then search for a matching
@@ -1956,55 +2073,50 @@ bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
 }
 
 const CuePoint* Cues::GetFirst() const {
-  if (m_cue_points == NULL)
-    return NULL;
-
-  if (m_count == 0)
+  if (m_cue_points == NULL || m_count == 0)
     return NULL;
 
   CuePoint* const* const pp = m_cue_points;
-  assert(pp);
+  if (pp == NULL)
+    return NULL;
 
   CuePoint* const pCP = pp[0];
-  assert(pCP);
-  assert(pCP->GetTimeCode() >= 0);
+  if (pCP == NULL || pCP->GetTimeCode() < 0)
+    return NULL;
 
   return pCP;
 }
 
 const CuePoint* Cues::GetLast() const {
-  if (m_cue_points == NULL)
-    return NULL;
-
-  if (m_count <= 0)
+  if (m_cue_points == NULL || m_count <= 0)
     return NULL;
 
   const long index = m_count - 1;
 
   CuePoint* const* const pp = m_cue_points;
-  assert(pp);
+  if (pp == NULL)
+    return NULL;
 
   CuePoint* const pCP = pp[index];
-  assert(pCP);
-  assert(pCP->GetTimeCode() >= 0);
+  if (pCP == NULL || pCP->GetTimeCode() < 0)
+    return NULL;
 
   return pCP;
 }
 
 const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
-  if (pCurr == NULL)
+  if (pCurr == NULL || pCurr->GetTimeCode() < 0 ||
+      m_cue_points == NULL || m_count < 1) {
     return NULL;
-
-  assert(pCurr->GetTimeCode() >= 0);
-  assert(m_cue_points);
-  assert(m_count >= 1);
+  }
 
   long index = pCurr->m_index;
-  assert(index < m_count);
+  if (index >= m_count)
+    return NULL;
 
   CuePoint* const* const pp = m_cue_points;
-  assert(pp);
-  assert(pp[index] == pCurr);
+  if (pp == NULL || pp[index] != pCurr)
+    return NULL;
 
   ++index;
 
@@ -2012,18 +2124,16 @@ const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
     return NULL;
 
   CuePoint* const pNext = pp[index];
-  assert(pNext);
-  assert(pNext->GetTimeCode() >= 0);
+
+  if (pNext == NULL || pNext->GetTimeCode() < 0)
+    return NULL;
 
   return pNext;
 }
 
 const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
                                  const CuePoint::TrackPosition* pTP) const {
-  if (pCP == NULL)
-    return NULL;
-
-  if (pTP == NULL)
+  if (pCP == NULL || pTP == NULL)
     return NULL;
 
   return m_pSegment->GetBlock(*pCP, *pTP);
@@ -2070,11 +2180,15 @@ const BlockEntry* Segment::GetBlock(const CuePoint& cp,
   // assert(Cluster::HasBlockEntries(this, tp.m_pos));
 
   Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos);  //, -1);
-  assert(pCluster);
+  if (pCluster == NULL)
+    return NULL;
 
   const ptrdiff_t idx = i - m_clusters;
 
-  PreloadCluster(pCluster, idx);
+  if (!PreloadCluster(pCluster, idx)) {
+    delete pCluster;
+    return NULL;
+  }
   assert(m_clusters);
   assert(m_clusterPreloadCount > 0);
   assert(m_clusters[idx] == pCluster);
@@ -2125,12 +2239,15 @@ const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) {
   // assert(Cluster::HasBlockEntries(this, tp.m_pos));
 
   Cluster* const pCluster = Cluster::Create(this, -1, requested_pos);
-  //-1);
-  assert(pCluster);
+  if (pCluster == NULL)
+    return NULL;
 
   const ptrdiff_t idx = i - m_clusters;
 
-  PreloadCluster(pCluster, idx);
+  if (!PreloadCluster(pCluster, idx)) {
+    delete pCluster;
+    return NULL;
+  }
   assert(m_clusters);
   assert(m_clusterPreloadCount > 0);
   assert(m_clusters[idx] == pCluster);
@@ -2168,9 +2285,8 @@ bool CuePoint::Load(IMkvReader* pReader) {
   {
     long len;
 
-    const long long id = ReadUInt(pReader, pos_, len);
-    assert(id == 0x3B);  // CuePoint ID
-    if (id != 0x3B)
+    const long long id = ReadID(pReader, pos_, len);
+    if (id != mkvmuxer::kMkvCuePoint)
       return false;
 
     pos_ += len;  // consume ID
@@ -2193,7 +2309,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
   while (pos < stop) {
     long len;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
     if ((id < 0) || (pos + len > stop)) {
       return false;
     }
@@ -2210,10 +2326,10 @@ bool CuePoint::Load(IMkvReader* pReader) {
       return false;
     }
 
-    if (id == 0x33)  // CueTime ID
+    if (id == mkvmuxer::kMkvCueTime)
       m_timecode = UnserializeUInt(pReader, pos, size);
 
-    else if (id == 0x37)  // CueTrackPosition(s) ID
+    else if (id == mkvmuxer::kMkvCueTrackPositions)
       ++m_track_positions_count;
 
     pos += size;  // consume payload
@@ -2227,7 +2343,9 @@ bool CuePoint::Load(IMkvReader* pReader) {
   //   << " timecode=" << m_timecode
   //   << endl;
 
-  m_track_positions = new TrackPosition[m_track_positions_count];
+  m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count];
+  if (m_track_positions == NULL)
+    return false;
 
   // Now parse track positions
 
@@ -2237,9 +2355,9 @@ bool CuePoint::Load(IMkvReader* pReader) {
   while (pos < stop) {
     long len;
 
-    const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);
-    assert((pos + len) <= stop);
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return false;
 
     pos += len;  // consume ID
 
@@ -2250,7 +2368,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
     pos += len;  // consume Size field
     assert((pos + size) <= stop);
 
-    if (id == 0x37) {  // CueTrackPosition(s) ID
+    if (id == mkvmuxer::kMkvCueTrackPositions) {
       TrackPosition& tp = *p++;
       if (!tp.Parse(pReader, pos, size)) {
         return false;
@@ -2258,7 +2376,8 @@ bool CuePoint::Load(IMkvReader* pReader) {
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return false;
   }
 
   assert(size_t(p - m_track_positions) == m_track_positions_count);
@@ -2281,7 +2400,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
   while (pos < stop) {
     long len;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
     if ((id < 0) || ((pos + len) > stop)) {
       return false;
     }
@@ -2298,13 +2417,11 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
       return false;
     }
 
-    if (id == 0x77)  // CueTrack ID
+    if (id == mkvmuxer::kMkvCueTrack)
       m_track = UnserializeUInt(pReader, pos, size);
-
-    else if (id == 0x71)  // CueClusterPos ID
+    else if (id == mkvmuxer::kMkvCueClusterPosition)
       m_pos = UnserializeUInt(pReader, pos, size);
-
-    else if (id == 0x1378)  // CueBlockNumber
+    else if (id == mkvmuxer::kMkvCueBlockNumber)
       m_block = UnserializeUInt(pReader, pos, size);
 
     pos += size;  // consume payload
@@ -2437,9 +2554,8 @@ const Cluster* Segment::GetNext(const Cluster* pCurr) {
     if (result != 0)
       return NULL;
 
-    const long long id = ReadUInt(m_pReader, pos, len);
-    assert(id == 0x0F43B675);  // Cluster ID
-    if (id != 0x0F43B675)
+    const long long id = ReadID(m_pReader, pos, len);
+    if (id != mkvmuxer::kMkvCluster)
       return NULL;
 
     pos += len;  // consume ID
@@ -2474,8 +2590,9 @@ const Cluster* Segment::GetNext(const Cluster* pCurr) {
 
     const long long idpos = pos;  // pos of next (potential) cluster
 
-    const long long id = ReadUInt(m_pReader, idpos, len);
-    assert(id > 0);  // TODO
+    const long long id = ReadID(m_pReader, idpos, len);
+    if (id < 0)
+      return NULL;
 
     pos += len;  // consume ID
 
@@ -2495,7 +2612,7 @@ const Cluster* Segment::GetNext(const Cluster* pCurr) {
     if (size == 0)  // weird
       continue;
 
-    if (id == 0x0F43B675) {  // Cluster ID
+    if (id == mkvmuxer::kMkvCluster) {
       const long long off_next_ = idpos - m_start;
 
       long long pos_;
@@ -2553,11 +2670,15 @@ const Cluster* Segment::GetNext(const Cluster* pCurr) {
   assert(i == j);
 
   Cluster* const pNext = Cluster::Create(this, -1, off_next);
-  assert(pNext);
+  if (pNext == NULL)
+    return NULL;
 
   const ptrdiff_t idx_next = i - m_clusters;  // insertion position
 
-  PreloadCluster(pNext, idx_next);
+  if (!PreloadCluster(pNext, idx_next)) {
+    delete pNext;
+    return NULL;
+  }
   assert(m_clusters);
   assert(idx_next < m_clusterSize);
   assert(m_clusters[idx_next] == pNext);
@@ -2641,7 +2762,7 @@ long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
 
-    const long long id = ReadUInt(m_pReader, pos, len);
+    const long long id = ReadID(m_pReader, pos, len);
 
-    if (id != 0x0F43B675)  // weird: not Cluster ID
+    if (id != mkvmuxer::kMkvCluster)
       return -1;
 
     pos += len;  // consume ID
@@ -2687,7 +2808,8 @@ long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
     // Pos now points to start of payload
 
     pos += size;  // consume payload (that is, the current cluster)
-    assert((segment_stop < 0) || (pos <= segment_stop));
+    if (segment_stop >= 0 && pos > segment_stop)
+      return E_FILE_FORMAT_INVALID;
 
     // By consuming the payload, we are assuming that the curr
     // cluster isn't interesting.  That is, we don't bother checking
@@ -2755,7 +2877,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
     const long long idpos = pos;  // absolute
     const long long idoff = pos - m_start;  // relative
 
-    const long long id = ReadUInt(m_pReader, idpos, len);  // absolute
+    const long long id = ReadID(m_pReader, idpos, len);  // absolute
 
     if (id < 0)  // error
       return static_cast<long>(id);
@@ -2805,7 +2927,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
       return E_FILE_FORMAT_INVALID;
     }
 
-    if (id == 0x0C53BB6B) {  // Cues ID
+    if (id == mkvmuxer::kMkvCues) {
       if (size == unknown_size)
         return E_FILE_FORMAT_INVALID;
 
@@ -2818,22 +2940,26 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
       const long long element_size = element_stop - element_start;
 
       if (m_pCues == NULL) {
-        m_pCues = new Cues(this, pos, size, element_start, element_size);
-        assert(m_pCues);  // TODO
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+        if (m_pCues == NULL)
+          return false;
       }
 
       pos += size;  // consume payload
-      assert((segment_stop < 0) || (pos <= segment_stop));
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
 
       continue;
     }
 
-    if (id != 0x0F43B675) {  // not a Cluster ID
+    if (id != mkvmuxer::kMkvCluster) {  // not a Cluster ID
       if (size == unknown_size)
         return E_FILE_FORMAT_INVALID;
 
       pos += size;  // consume payload
-      assert((segment_stop < 0) || (pos <= segment_stop));
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
 
       continue;
     }
@@ -2905,12 +3031,15 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
     Cluster* const pNext = Cluster::Create(this,
                                            -1,  // preloaded
                                            off_next);
-    // element_size);
-    assert(pNext);
+    if (pNext == NULL)
+      return -1;
 
     const ptrdiff_t idx_next = i - m_clusters;  // insertion position
 
-    PreloadCluster(pNext, idx_next);
+    if (!PreloadCluster(pNext, idx_next)) {
+      delete pNext;
+      return -1;
+    }
     assert(m_clusters);
     assert(idx_next < m_clusterSize);
     assert(m_clusters[idx_next] == pNext);
@@ -2953,7 +3082,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
         return E_BUFFER_NOT_FULL;
 
       const long long idpos = pos;
-      const long long id = ReadUInt(m_pReader, idpos, len);
+      const long long id = ReadID(m_pReader, idpos, len);
 
       if (id < 0)  // error (or underflow)
         return static_cast<long>(id);
@@ -2962,10 +3091,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
       // that we have exhausted the sub-element's inside the cluster
       // whose ID we parsed earlier.
 
-      if (id == 0x0F43B675)  // Cluster ID
-        break;
-
-      if (id == 0x0C53BB6B)  // Cues ID
+      if (id == mkvmuxer::kMkvCluster || id == mkvmuxer::kMkvCues)
         break;
 
       pos += len;  // consume ID (of sub-element)
@@ -3012,7 +3138,8 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
         return E_FILE_FORMAT_INVALID;
 
       pos += size;  // consume payload of sub-element
-      assert((segment_stop < 0) || (pos <= segment_stop));
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
     }  // determine cluster size
 
     cluster_size = pos - payload_pos;
@@ -3022,7 +3149,8 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
   }
 
   pos += cluster_size;  // consume payload
-  assert((segment_stop < 0) || (pos <= segment_stop));
+  if (segment_stop >= 0 && pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
 
   return 2;  // try to find a cluster that follows next
 }
@@ -3131,7 +3259,7 @@ long Chapters::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == 0x05B9) {  // EditionEntry ID
+    if (id == mkvmuxer::kMkvEditionEntry) {
       status = ParseEdition(pos, size);
 
       if (status < 0)  // error
@@ -3139,10 +3267,12 @@ long Chapters::Parse() {
     }
 
     pos += size;
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -3242,10 +3372,10 @@ long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
     if (status < 0)  // error
       return status;
 
-    if (size == 0)  // weird
+    if (size == 0)
       continue;
 
-    if (id == 0x36) {  // Atom ID
+    if (id == mkvmuxer::kMkvChapterAtom) {
       status = ParseAtom(pReader, pos, size);
 
       if (status < 0)  // error
@@ -3253,10 +3383,12 @@ long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
     }
 
     pos += size;
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -3373,20 +3505,20 @@ long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
     if (status < 0)  // error
       return status;
 
-    if (size == 0)  // weird
+    if (size == 0)  // 0 length payload, skip.
       continue;
 
-    if (id == 0x00) {  // Display ID
+    if (id == mkvmuxer::kMkvChapterDisplay) {
       status = ParseDisplay(pReader, pos, size);
 
       if (status < 0)  // error
         return status;
-    } else if (id == 0x1654) {  // StringUID ID
+    } else if (id == mkvmuxer::kMkvChapterStringUID) {
       status = UnserializeString(pReader, pos, size, m_string_uid);
 
       if (status < 0)  // error
         return status;
-    } else if (id == 0x33C4) {  // UID ID
+    } else if (id == mkvmuxer::kMkvChapterUID) {
       long long val;
       status = UnserializeInt(pReader, pos, size, val);
 
@@ -3394,14 +3526,14 @@ long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
         return status;
 
       m_uid = static_cast<unsigned long long>(val);
-    } else if (id == 0x11) {  // TimeStart ID
+    } else if (id == mkvmuxer::kMkvChapterTimeStart) {
       const long long val = UnserializeUInt(pReader, pos, size);
 
       if (val < 0)  // error
         return static_cast<long>(val);
 
       m_start_timecode = val;
-    } else if (id == 0x12) {  // TimeEnd ID
+    } else if (id == mkvmuxer::kMkvChapterTimeEnd) {
       const long long val = UnserializeUInt(pReader, pos, size);
 
       if (val < 0)  // error
@@ -3411,10 +3543,12 @@ long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
     }
 
     pos += size;
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -3524,20 +3658,20 @@ long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
     if (status < 0)  // error
       return status;
 
-    if (size == 0)  // weird
+    if (size == 0)  // No payload.
       continue;
 
-    if (id == 0x05) {  // ChapterString ID
+    if (id == mkvmuxer::kMkvChapString) {
       status = UnserializeString(pReader, pos, size, m_string);
 
       if (status)
         return status;
-    } else if (id == 0x037C) {  // ChapterLanguage ID
+    } else if (id == mkvmuxer::kMkvChapLanguage) {
       status = UnserializeString(pReader, pos, size, m_language);
 
       if (status)
         return status;
-    } else if (id == 0x037E) {  // ChapterCountry ID
+    } else if (id == mkvmuxer::kMkvChapCountry) {
       status = UnserializeString(pReader, pos, size, m_country);
 
       if (status)
@@ -3545,10 +3679,12 @@ long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
     }
 
     pos += size;
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -3588,7 +3724,7 @@ long Tags::Parse() {
     if (size == 0)  // 0 length tag, read another
       continue;
 
-    if (id == 0x3373) {  // Tag ID
+    if (id == mkvmuxer::kMkvTag) {
       status = ParseTag(pos, size);
 
       if (status < 0)
@@ -3596,14 +3732,12 @@ long Tags::Parse() {
     }
 
     pos += size;
-    assert(pos <= stop);
     if (pos > stop)
-      return -1;
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
   if (pos != stop)
-    return -1;
+    return E_FILE_FORMAT_INVALID;
 
   return 0;
 }
@@ -3706,7 +3840,7 @@ long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
     if (size == 0)  // 0 length tag, read another
       continue;
 
-    if (id == 0x27C8) {  // SimpleTag ID
+    if (id == mkvmuxer::kMkvSimpleTag) {
       status = ParseSimpleTag(pReader, pos, size);
 
       if (status < 0)
@@ -3714,14 +3848,12 @@ long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
     }
 
     pos += size;
-    assert(pos <= stop);
     if (pos > stop)
-      return -1;
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
   if (pos != stop)
-    return -1;
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -3799,12 +3931,12 @@ long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
     if (size == 0)  // weird
       continue;
 
-    if (id == 0x5A3) {  // TagName ID
+    if (id == mkvmuxer::kMkvTagName) {
       status = UnserializeString(pReader, pos, size, m_tag_name);
 
       if (status)
         return status;
-    } else if (id == 0x487) {  // TagString ID
+    } else if (id == mkvmuxer::kMkvTagString) {
       status = UnserializeString(pReader, pos, size, m_tag_string);
 
       if (status)
@@ -3812,14 +3944,12 @@ long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
     }
 
     pos += size;
-    assert(pos <= stop);
     if (pos > stop)
-      return -1;
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
   if (pos != stop)
-    return -1;
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -3866,12 +3996,12 @@ long SegmentInfo::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == 0x0AD7B1) {  // Timecode Scale
+    if (id == mkvmuxer::kMkvTimecodeScale) {
       m_timecodeScale = UnserializeUInt(pReader, pos, size);
 
       if (m_timecodeScale <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x0489) {  // Segment duration
+    } else if (id == mkvmuxer::kMkvDuration) {
       const long status = UnserializeFloat(pReader, pos, size, m_duration);
 
       if (status < 0)
@@ -3879,19 +4009,19 @@ long SegmentInfo::Parse() {
 
       if (m_duration < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x0D80) {  // MuxingApp
+    } else if (id == mkvmuxer::kMkvMuxingApp) {
       const long status =
           UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
 
       if (status)
         return status;
-    } else if (id == 0x1741) {  // WritingApp
+    } else if (id == mkvmuxer::kMkvWritingApp) {
       const long status =
           UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
 
       if (status)
         return status;
-    } else if (id == 0x3BA9) {  // Title
+    } else if (id == mkvmuxer::kMkvTitle) {
       const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
 
       if (status)
@@ -3899,10 +4029,17 @@ long SegmentInfo::Parse() {
     }
 
     pos += size;
-    assert(pos <= stop);
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  const double rollover_check = m_duration * m_timecodeScale;
+  if (rollover_check > LLONG_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   return 0;
 }
@@ -4039,15 +4176,15 @@ long ContentEncoding::ParseContentEncAESSettingsEntry(
     if (status < 0)  // error
       return status;
 
-    if (id == 0x7E8) {
-      // AESSettingsCipherMode
+    if (id == mkvmuxer::kMkvAESSettingsCipherMode) {
       aes->cipher_mode = UnserializeUInt(pReader, pos, size);
       if (aes->cipher_mode != 1)
         return E_FILE_FORMAT_INVALID;
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
   return 0;
@@ -4070,14 +4207,15 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == 0x1034)  // ContentCompression ID
+    if (id == mkvmuxer::kMkvContentCompression)
       ++compression_count;
 
-    if (id == 0x1035)  // ContentEncryption ID
+    if (id == mkvmuxer::kMkvContentEncryption)
       ++encryption_count;
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
   if (compression_count <= 0 && encryption_count <= 0)
@@ -4108,19 +4246,15 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == 0x1031) {
-      // ContentEncodingOrder
+    if (id == mkvmuxer::kMkvContentEncodingOrder) {
       encoding_order_ = UnserializeUInt(pReader, pos, size);
-    } else if (id == 0x1032) {
-      // ContentEncodingScope
+    } else if (id == mkvmuxer::kMkvContentEncodingScope) {
       encoding_scope_ = UnserializeUInt(pReader, pos, size);
       if (encoding_scope_ < 1)
         return -1;
-    } else if (id == 0x1033) {
-      // ContentEncodingType
+    } else if (id == mkvmuxer::kMkvContentEncodingType) {
       encoding_type_ = UnserializeUInt(pReader, pos, size);
-    } else if (id == 0x1034) {
-      // ContentCompression ID
+    } else if (id == mkvmuxer::kMkvContentCompression) {
       ContentCompression* const compression =
           new (std::nothrow) ContentCompression();
       if (!compression)
@@ -4132,8 +4266,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
         return status;
       }
       *compression_entries_end_++ = compression;
-    } else if (id == 0x1035) {
-      // ContentEncryption ID
+    } else if (id == mkvmuxer::kMkvContentEncryption) {
       ContentEncryption* const encryption =
           new (std::nothrow) ContentEncryption();
       if (!encryption)
@@ -4148,10 +4281,12 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
   return 0;
 }
 
@@ -4172,21 +4307,18 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == 0x254) {
-      // ContentCompAlgo
+    if (id == mkvmuxer::kMkvContentCompAlgo) {
       long long algo = UnserializeUInt(pReader, pos, size);
       if (algo < 0)
         return E_FILE_FORMAT_INVALID;
       compression->algo = algo;
       valid = true;
-    } else if (id == 0x255) {
-      // ContentCompSettings
+    } else if (id == mkvmuxer::kMkvContentCompSettings) {
       if (size <= 0)
         return E_FILE_FORMAT_INVALID;
 
       const size_t buflen = static_cast<size_t>(size);
-      typedef unsigned char* buf_t;
-      const buf_t buf = new (std::nothrow) unsigned char[buflen];
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
       if (buf == NULL)
         return -1;
 
@@ -4202,7 +4334,8 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size,
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
   // ContentCompAlgo is mandatory
@@ -4227,13 +4360,11 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == 0x7E1) {
-      // ContentEncAlgo
+    if (id == mkvmuxer::kMkvContentEncAlgo) {
       encryption->algo = UnserializeUInt(pReader, pos, size);
       if (encryption->algo != 5)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x7E2) {
-      // ContentEncKeyID
+    } else if (id == mkvmuxer::kMkvContentEncKeyID) {
       delete[] encryption->key_id;
       encryption->key_id = NULL;
       encryption->key_id_len = 0;
@@ -4242,8 +4373,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
         return E_FILE_FORMAT_INVALID;
 
       const size_t buflen = static_cast<size_t>(size);
-      typedef unsigned char* buf_t;
-      const buf_t buf = new (std::nothrow) unsigned char[buflen];
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
       if (buf == NULL)
         return -1;
 
@@ -4256,8 +4386,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
 
       encryption->key_id = buf;
       encryption->key_id_len = buflen;
-    } else if (id == 0x7E3) {
-      // ContentSignature
+    } else if (id == mkvmuxer::kMkvContentSignature) {
       delete[] encryption->signature;
       encryption->signature = NULL;
       encryption->signature_len = 0;
@@ -4266,8 +4395,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
         return E_FILE_FORMAT_INVALID;
 
       const size_t buflen = static_cast<size_t>(size);
-      typedef unsigned char* buf_t;
-      const buf_t buf = new (std::nothrow) unsigned char[buflen];
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
       if (buf == NULL)
         return -1;
 
@@ -4280,8 +4408,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
 
       encryption->signature = buf;
       encryption->signature_len = buflen;
-    } else if (id == 0x7E4) {
-      // ContentSigKeyID
+    } else if (id == mkvmuxer::kMkvContentSigKeyID) {
       delete[] encryption->sig_key_id;
       encryption->sig_key_id = NULL;
       encryption->sig_key_id_len = 0;
@@ -4290,8 +4417,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
         return E_FILE_FORMAT_INVALID;
 
       const size_t buflen = static_cast<size_t>(size);
-      typedef unsigned char* buf_t;
-      const buf_t buf = new (std::nothrow) unsigned char[buflen];
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
       if (buf == NULL)
         return -1;
 
@@ -4304,14 +4430,11 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
 
       encryption->sig_key_id = buf;
       encryption->sig_key_id_len = buflen;
-    } else if (id == 0x7E5) {
-      // ContentSigAlgo
+    } else if (id == mkvmuxer::kMkvContentSigAlgo) {
       encryption->sig_algo = UnserializeUInt(pReader, pos, size);
-    } else if (id == 0x7E6) {
-      // ContentSigHashAlgo
+    } else if (id == mkvmuxer::kMkvContentSigHashAlgo) {
       encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
-    } else if (id == 0x7E7) {
-      // ContentEncAESSettings
+    } else if (id == mkvmuxer::kMkvContentEncAESSettings) {
       const long status = ParseContentEncAESSettingsEntry(
           pos, size, pReader, &encryption->aes_settings);
       if (status)
@@ -4319,7 +4442,8 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
   return 0;
@@ -4418,7 +4542,7 @@ int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
 
   const size_t len = strlen(src);
 
-  dst = new (std::nothrow) char[len + 1];
+  dst = SafeArrayAlloc<char>(1, len + 1);
 
   if (dst == NULL)
     return -1;
@@ -4469,7 +4593,7 @@ int Track::Info::Copy(Info& dst) const {
     if (dst.codecPrivateSize != 0)
       return -1;
 
-    dst.codecPrivate = new (std::nothrow) unsigned char[codecPrivateSize];
+    dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
 
     if (dst.codecPrivate == NULL)
       return -1;
@@ -4797,11 +4921,12 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == 0x2240)  // ContentEncoding ID
+    if (id == mkvmuxer::kMkvContentEncoding)
       ++count;
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
   if (count <= 0)
@@ -4821,7 +4946,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == 0x2240) {  // ContentEncoding ID
+    if (id == mkvmuxer::kMkvContentEncoding) {
       ContentEncoding* const content_encoding =
           new (std::nothrow) ContentEncoding();
       if (!content_encoding)
@@ -4837,10 +4962,12 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   return 0;
 }
@@ -4892,37 +5019,37 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
     if (status < 0)  // error
       return status;
 
-    if (id == 0x30) {  // pixel width
+    if (id == mkvmuxer::kMkvPixelWidth) {
       width = UnserializeUInt(pReader, pos, size);
 
       if (width <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x3A) {  // pixel height
+    } else if (id == mkvmuxer::kMkvPixelHeight) {
       height = UnserializeUInt(pReader, pos, size);
 
       if (height <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x14B0) {  // display width
+    } else if (id == mkvmuxer::kMkvDisplayWidth) {
       display_width = UnserializeUInt(pReader, pos, size);
 
       if (display_width <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x14BA) {  // display height
+    } else if (id == mkvmuxer::kMkvDisplayHeight) {
       display_height = UnserializeUInt(pReader, pos, size);
 
       if (display_height <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x14B2) {  // display unit
+    } else if (id == mkvmuxer::kMkvDisplayUnit) {
       display_unit = UnserializeUInt(pReader, pos, size);
 
       if (display_unit < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x13B8) {  // stereo mode
+    } else if (id == mkvmuxer::kMkvStereoMode) {
       stereo_mode = UnserializeUInt(pReader, pos, size);
 
       if (stereo_mode < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x0383E3) {  // frame rate
+    } else if (id == mkvmuxer::kMkvFrameRate) {
       const long status = UnserializeFloat(pReader, pos, size, rate);
 
       if (status < 0)
@@ -4933,10 +5060,12 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   VideoTrack* const pTrack =
       new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
@@ -5110,7 +5239,7 @@ long AudioTrack::Parse(Segment* pSegment, const Info& info,
     if (status < 0)  // error
       return status;
 
-    if (id == 0x35) {  // Sample Rate
+    if (id == mkvmuxer::kMkvSamplingFrequency) {
       status = UnserializeFloat(pReader, pos, size, rate);
 
       if (status < 0)
@@ -5118,12 +5247,12 @@ long AudioTrack::Parse(Segment* pSegment, const Info& info,
 
       if (rate <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x1F) {  // Channel Count
+    } else if (id == mkvmuxer::kMkvChannels) {
       channels = UnserializeUInt(pReader, pos, size);
 
       if (channels <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x2264) {  // Bit Depth
+    } else if (id == mkvmuxer::kMkvBitDepth) {
       bit_depth = UnserializeUInt(pReader, pos, size);
 
       if (bit_depth <= 0)
@@ -5131,10 +5260,12 @@ long AudioTrack::Parse(Segment* pSegment, const Info& info,
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   AudioTrack* const pTrack =
       new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
@@ -5194,14 +5325,16 @@ long Tracks::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == 0x2E)  // TrackEntry ID
+    if (id == mkvmuxer::kMkvTrackEntry)
       ++count;
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   if (count <= 0)
     return 0;  // success
@@ -5234,13 +5367,12 @@ long Tracks::Parse() {
 
     const long long element_size = payload_stop - element_start;
 
-    if (id == 0x2E) {  // TrackEntry ID
+    if (id == mkvmuxer::kMkvTrackEntry) {
       Track*& pTrack = *m_trackEntriesEnd;
       pTrack = NULL;
 
       const long status = ParseTrackEntry(pos, payload_size, element_start,
                                           element_size, pTrack);
-
       if (status)
         return status;
 
@@ -5249,10 +5381,12 @@ long Tracks::Parse() {
     }
 
     pos = payload_stop;
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
 
   return 0;  // success
 }
@@ -5309,16 +5443,16 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
 
     const long long start = pos;
 
-    if (id == 0x60) {  // VideoSettings ID
+    if (id == mkvmuxer::kMkvVideo) {
       v.start = start;
       v.size = size;
-    } else if (id == 0x61) {  // AudioSettings ID
+    } else if (id == mkvmuxer::kMkvAudio) {
       a.start = start;
       a.size = size;
-    } else if (id == 0x2D80) {  // ContentEncodings ID
+    } else if (id == mkvmuxer::kMkvContentEncodings) {
       e.start = start;
       e.size = size;
-    } else if (id == 0x33C5) {  // Track UID
+    } else if (id == mkvmuxer::kMkvTrackUID) {
       if (size > 8)
         return E_FILE_FORMAT_INVALID;
 
@@ -5340,49 +5474,49 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
 
         ++pos_;
       }
-    } else if (id == 0x57) {  // Track Number
+    } else if (id == mkvmuxer::kMkvTrackNumber) {
       const long long num = UnserializeUInt(pReader, pos, size);
 
       if ((num <= 0) || (num > 127))
         return E_FILE_FORMAT_INVALID;
 
       info.number = static_cast<long>(num);
-    } else if (id == 0x03) {  // Track Type
+    } else if (id == mkvmuxer::kMkvTrackType) {
       const long long type = UnserializeUInt(pReader, pos, size);
 
       if ((type <= 0) || (type > 254))
         return E_FILE_FORMAT_INVALID;
 
       info.type = static_cast<long>(type);
-    } else if (id == 0x136E) {  // Track Name
+    } else if (id == mkvmuxer::kMkvName) {
       const long status =
           UnserializeString(pReader, pos, size, info.nameAsUTF8);
 
       if (status)
         return status;
-    } else if (id == 0x02B59C) {  // Track Language
+    } else if (id == mkvmuxer::kMkvLanguage) {
       const long status = UnserializeString(pReader, pos, size, info.language);
 
       if (status)
         return status;
-    } else if (id == 0x03E383) {  // Default Duration
+    } else if (id == mkvmuxer::kMkvDefaultDuration) {
       const long long duration = UnserializeUInt(pReader, pos, size);
 
       if (duration < 0)
         return E_FILE_FORMAT_INVALID;
 
       info.defaultDuration = static_cast<unsigned long long>(duration);
-    } else if (id == 0x06) {  // CodecID
+    } else if (id == mkvmuxer::kMkvCodecID) {
       const long status = UnserializeString(pReader, pos, size, info.codecId);
 
       if (status)
         return status;
-    } else if (id == 0x1C) {  // lacing
+    } else if (id == mkvmuxer::kMkvFlagLacing) {
       lacing = UnserializeUInt(pReader, pos, size);
 
       if ((lacing < 0) || (lacing > 1))
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x23A2) {  // Codec Private
+    } else if (id == mkvmuxer::kMkvCodecPrivate) {
       delete[] info.codecPrivate;
       info.codecPrivate = NULL;
       info.codecPrivateSize = 0;
@@ -5390,9 +5524,7 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
       const size_t buflen = static_cast<size_t>(size);
 
       if (buflen) {
-        typedef unsigned char* buf_t;
-
-        const buf_t buf = new (std::nothrow) unsigned char[buflen];
+        unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
 
         if (buf == NULL)
           return -1;
@@ -5407,23 +5539,25 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
         info.codecPrivate = buf;
         info.codecPrivateSize = buflen;
       }
-    } else if (id == 0x058688) {  // Codec Name
+    } else if (id == mkvmuxer::kMkvCodecName) {
       const long status =
           UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
 
       if (status)
         return status;
-    } else if (id == 0x16AA) {  // Codec Delay
+    } else if (id == mkvmuxer::kMkvCodecDelay) {
       info.codecDelay = UnserializeUInt(pReader, pos, size);
-    } else if (id == 0x16BB) {  // Seek Pre Roll
+    } else if (id == mkvmuxer::kMkvSeekPreRoll) {
       info.seekPreRoll = UnserializeUInt(pReader, pos, size);
     }
 
     pos += size;  // consume payload
-    assert(pos <= track_stop);
+    if (pos > track_stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == track_stop);
+  if (pos != track_stop)
+    return E_FILE_FORMAT_INVALID;
 
   if (info.number <= 0)  // not specified
     return E_FILE_FORMAT_INVALID;
@@ -5552,97 +5686,87 @@ const Track* Tracks::GetTrackByIndex(unsigned long idx) const {
 }
 
 long Cluster::Load(long long& pos, long& len) const {
-  assert(m_pSegment);
-  assert(m_pos >= m_element_start);
+  if (m_pSegment == NULL)
+    return E_PARSE_FAILED;
 
   if (m_timecode >= 0)  // at least partially loaded
     return 0;
 
-  assert(m_pos == m_element_start);
-  assert(m_element_size < 0);
+  if (m_pos != m_element_start || m_element_size >= 0)
+    return E_PARSE_FAILED;
 
   IMkvReader* const pReader = m_pSegment->m_pReader;
-
   long long total, avail;
-
   const int status = pReader->Length(&total, &avail);
 
   if (status < 0)  // error
     return status;
 
-  assert((total < 0) || (avail <= total));
-  assert((total < 0) || (m_pos <= total));  // TODO: verify this
+  if (total >= 0 && (avail > total || m_pos > total))
+    return E_FILE_FORMAT_INVALID;
 
   pos = m_pos;
 
   long long cluster_size = -1;
 
-  {
-    if ((pos + 1) > avail) {
-      len = 1;
-      return E_BUFFER_NOT_FULL;
-    }
-
-    long long result = GetUIntLength(pReader, pos, len);
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    if (result < 0)  // error or underflow
-      return static_cast<long>(result);
+  long long result = GetUIntLength(pReader, pos, len);
 
-    if (result > 0)  // underflow (weird)
-      return E_BUFFER_NOT_FULL;
+  if (result < 0)  // error or underflow
+    return static_cast<long>(result);
 
-    // if ((pos + len) > segment_stop)
-    //    return E_FILE_FORMAT_INVALID;
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
 
-    if ((pos + len) > avail)
-      return E_BUFFER_NOT_FULL;
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
 
-    const long long id_ = ReadUInt(pReader, pos, len);
+  const long long id_ = ReadID(pReader, pos, len);
 
-    if (id_ < 0)  // error
-      return static_cast<long>(id_);
+  if (id_ < 0)  // error
+    return static_cast<long>(id_);
 
-    if (id_ != 0x0F43B675)  // Cluster ID
-      return E_FILE_FORMAT_INVALID;
+  if (id_ != mkvmuxer::kMkvCluster)
+    return E_FILE_FORMAT_INVALID;
 
-    pos += len;  // consume id
+  pos += len;  // consume id
 
-    // read cluster size
+  // read cluster size
 
-    if ((pos + 1) > avail) {
-      len = 1;
-      return E_BUFFER_NOT_FULL;
-    }
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    result = GetUIntLength(pReader, pos, len);
+  result = GetUIntLength(pReader, pos, len);
 
-    if (result < 0)  // error
-      return static_cast<long>(result);
+  if (result < 0)  // error
+    return static_cast<long>(result);
 
-    if (result > 0)  // weird
-      return E_BUFFER_NOT_FULL;
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
 
-    // if ((pos + len) > segment_stop)
-    //    return E_FILE_FORMAT_INVALID;
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
 
-    if ((pos + len) > avail)
-      return E_BUFFER_NOT_FULL;
+  const long long size = ReadUInt(pReader, pos, len);
 
-    const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0)  // error
+    return static_cast<long>(cluster_size);
 
-    if (size < 0)  // error
-      return static_cast<long>(cluster_size);
+  if (size == 0)
+    return E_FILE_FORMAT_INVALID;
 
-    if (size == 0)
-      return E_FILE_FORMAT_INVALID;  // TODO: verify this
+  pos += len;  // consume length of size of element
 
-    pos += len;  // consume length of size of element
+  const long long unknown_size = (1LL << (7 * len)) - 1;
 
-    const long long unknown_size = (1LL << (7 * len)) - 1;
-
-    if (size != unknown_size)
-      cluster_size = size;
-  }
+  if (size != unknown_size)
+    cluster_size = size;
 
   // pos points to start of payload
   long long timecode = -1;
@@ -5667,7 +5791,7 @@ long Cluster::Load(long long& pos, long& len) const {
     if (result < 0)  // error
       return static_cast<long>(result);
 
-    if (result > 0)  // weird
+    if (result > 0)
       return E_BUFFER_NOT_FULL;
 
     if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5676,7 +5800,7 @@ long Cluster::Load(long long& pos, long& len) const {
     if ((pos + len) > avail)
       return E_BUFFER_NOT_FULL;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
 
     if (id < 0)  // error
       return static_cast<long>(id);
@@ -5688,10 +5812,10 @@ long Cluster::Load(long long& pos, long& len) const {
     // that we have exhausted the sub-element's inside the cluster
     // whose ID we parsed earlier.
 
-    if (id == 0x0F43B675)  // Cluster ID
+    if (id == mkvmuxer::kMkvCluster)
       break;
 
-    if (id == 0x0C53BB6B)  // Cues ID
+    if (id == mkvmuxer::kMkvCues)
       break;
 
     pos += len;  // consume ID field
@@ -5708,7 +5832,7 @@ long Cluster::Load(long long& pos, long& len) const {
     if (result < 0)  // error
       return static_cast<long>(result);
 
-    if (result > 0)  // weird
+    if (result > 0)
       return E_BUFFER_NOT_FULL;
 
     if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5734,13 +5858,13 @@ long Cluster::Load(long long& pos, long& len) const {
 
     // pos now points to start of payload
 
-    if (size == 0)  // weird
+    if (size == 0)
       continue;
 
     if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
       return E_FILE_FORMAT_INVALID;
 
-    if (id == 0x67) {  // TimeCode ID
+    if (id == mkvmuxer::kMkvTimecode) {
       len = static_cast<long>(size);
 
       if ((pos + size) > avail)
@@ -5755,19 +5879,21 @@ long Cluster::Load(long long& pos, long& len) const {
 
       if (bBlock)
         break;
-    } else if (id == 0x20) {  // BlockGroup ID
+    } else if (id == mkvmuxer::kMkvBlockGroup) {
       bBlock = true;
       break;
-    } else if (id == 0x23) {  // SimpleBlock ID
+    } else if (id == mkvmuxer::kMkvSimpleBlock) {
       bBlock = true;
       break;
     }
 
     pos += size;  // consume payload
-    assert((cluster_stop < 0) || (pos <= cluster_stop));
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert((cluster_stop < 0) || (pos <= cluster_stop));
+  if (cluster_stop >= 0 && pos > cluster_stop)
+    return E_FILE_FORMAT_INVALID;
 
   if (timecode < 0)  // no timecode found
     return E_FILE_FORMAT_INVALID;
@@ -5790,10 +5916,8 @@ long Cluster::Parse(long long& pos, long& len) const {
   if (status < 0)
     return status;
 
-  assert(m_pos >= m_element_start);
-  assert(m_timecode >= 0);
-  // assert(m_size > 0);
-  // assert(m_element_size > m_size);
+  if (m_pos < m_element_start || m_timecode < 0)
+    return E_PARSE_FAILED;
 
   const long long cluster_stop =
       (m_element_size < 0) ? -1 : m_element_start + m_element_size;
@@ -5810,7 +5934,8 @@ long Cluster::Parse(long long& pos, long& len) const {
   if (status < 0)  // error
     return status;
 
-  assert((total < 0) || (avail <= total));
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
 
   pos = m_pos;
 
@@ -5837,7 +5962,7 @@ long Cluster::Parse(long long& pos, long& len) const {
     if (result < 0)  // error
       return static_cast<long>(result);
 
-    if (result > 0)  // weird
+    if (result > 0)
       return E_BUFFER_NOT_FULL;
 
     if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5846,19 +5971,16 @@ long Cluster::Parse(long long& pos, long& len) const {
     if ((pos + len) > avail)
       return E_BUFFER_NOT_FULL;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
 
-    if (id < 0)  // error
-      return static_cast<long>(id);
-
-    if (id == 0)  // weird
+    if (id < 0)
       return E_FILE_FORMAT_INVALID;
 
     // This is the distinguished set of ID's we use to determine
     // that we have exhausted the sub-element's inside the cluster
     // whose ID we parsed earlier.
 
-    if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) {  // Cluster or Cues ID
+    if ((id == mkvmuxer::kMkvCluster) || (id == mkvmuxer::kMkvCues)) {
       if (m_element_size < 0)
         m_element_size = pos - m_element_start;
 
@@ -5879,7 +6001,7 @@ long Cluster::Parse(long long& pos, long& len) const {
     if (result < 0)  // error
       return static_cast<long>(result);
 
-    if (result > 0)  // weird
+    if (result > 0)
       return E_BUFFER_NOT_FULL;
 
     if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
@@ -5905,7 +6027,7 @@ long Cluster::Parse(long long& pos, long& len) const {
 
     // pos now points to start of payload
 
-    if (size == 0)  // weird
+    if (size == 0)
       continue;
 
     // const long long block_start = pos;
@@ -5913,8 +6035,10 @@ long Cluster::Parse(long long& pos, long& len) const {
 
     if (cluster_stop >= 0) {
       if (block_stop > cluster_stop) {
-        if ((id == 0x20) || (id == 0x23))
+        if (id == mkvmuxer::kMkvBlockGroup ||
+            id == mkvmuxer::kMkvSimpleBlock) {
           return E_FILE_FORMAT_INVALID;
+        }
 
         pos = cluster_stop;
         break;
@@ -5930,42 +6054,48 @@ long Cluster::Parse(long long& pos, long& len) const {
 
     Cluster* const this_ = const_cast<Cluster*>(this);
 
-    if (id == 0x20)  // BlockGroup
+    if (id == mkvmuxer::kMkvBlockGroup)
       return this_->ParseBlockGroup(size, pos, len);
 
-    if (id == 0x23)  // SimpleBlock
+    if (id == mkvmuxer::kMkvSimpleBlock)
       return this_->ParseSimpleBlock(size, pos, len);
 
     pos += size;  // consume payload
-    assert((cluster_stop < 0) || (pos <= cluster_stop));
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(m_element_size > 0);
+  if (m_element_size < 1)
+    return E_FILE_FORMAT_INVALID;
 
   m_pos = pos;
-  assert((cluster_stop < 0) || (m_pos <= cluster_stop));
+  if (cluster_stop >= 0 && m_pos > cluster_stop)
+    return E_FILE_FORMAT_INVALID;
 
   if (m_entries_count > 0) {
     const long idx = m_entries_count - 1;
 
     const BlockEntry* const pLast = m_entries[idx];
-    assert(pLast);
+    if (pLast == NULL)
+      return E_PARSE_FAILED;
 
     const Block* const pBlock = pLast->GetBlock();
-    assert(pBlock);
+    if (pBlock == NULL)
+      return E_PARSE_FAILED;
 
     const long long start = pBlock->m_start;
 
     if ((total >= 0) && (start > total))
-      return -1;  // defend against trucated stream
+      return E_PARSE_FAILED;  // defend against trucated stream
 
     const long long size = pBlock->m_size;
 
     const long long stop = start + size;
-    assert((cluster_stop < 0) || (stop <= cluster_stop));
+    if (cluster_stop >= 0 && stop > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
 
     if ((total >= 0) && (stop > total))
-      return -1;  // defend against trucated stream
+      return E_PARSE_FAILED;  // defend against trucated stream
   }
 
   return 1;  // no more entries
@@ -6058,7 +6188,7 @@ long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
     return E_BUFFER_NOT_FULL;
   }
 
-  status = CreateBlock(0x23,  // simple block id
+  status = CreateBlock(mkvmuxer::kMkvSimpleBlock,
                        block_start, block_size,
                        0);  // DiscardPadding
 
@@ -6118,12 +6248,12 @@ long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
     if ((pos + len) > avail)
       return E_BUFFER_NOT_FULL;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
 
     if (id < 0)  // error
       return static_cast<long>(id);
 
-    if (id == 0)  // not a value ID
+    if (id == 0)  // not a valid ID
       return E_FILE_FORMAT_INVALID;
 
     pos += len;  // consume ID field
@@ -6169,14 +6299,14 @@ long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
     if (size == unknown_size)
       return E_FILE_FORMAT_INVALID;
 
-    if (id == 0x35A2) {  // DiscardPadding
+    if (id == mkvmuxer::kMkvDiscardPadding) {
       status = UnserializeInt(pReader, pos, size, discard_padding);
 
       if (status < 0)  // error
         return status;
     }
 
-    if (id != 0x21) {  // sub-part of BlockGroup is not a Block
+    if (id != mkvmuxer::kMkvBlock) {
       pos += size;  // consume sub-part of block group
 
       if (pos > payload_stop)
@@ -6262,12 +6392,14 @@ long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
     }
 
     pos = block_stop;  // consume block-part of block group
-    assert(pos <= payload_stop);
+    if (pos > payload_stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
-  assert(pos == payload_stop);
+  if (pos != payload_stop)
+    return E_FILE_FORMAT_INVALID;
 
-  status = CreateBlock(0x20,  // BlockGroup ID
+  status = CreateBlock(mkvmuxer::kMkvBlockGroup,
                        payload_start, payload_size, discard_padding);
   if (status != 0)
     return status;
@@ -6310,17 +6442,14 @@ long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const {
   return E_BUFFER_NOT_FULL;  // underflow, since more remains to be parsed
 }
 
-Cluster* Cluster::Create(Segment* pSegment, long idx, long long off)
-// long long element_size)
-{
-  assert(pSegment);
-  assert(off >= 0);
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) {
+  if (!pSegment || off < 0)
+    return NULL;
 
   const long long element_start = pSegment->m_start + off;
 
-  Cluster* const pCluster = new Cluster(pSegment, idx, element_start);
-  // element_size);
-  assert(pCluster);
+  Cluster* const pCluster =
+      new (std::nothrow) Cluster(pSegment, idx, element_start);
 
   return pCluster;
 }
@@ -6431,13 +6560,13 @@ long Cluster::HasBlockEntries(
     if ((pos + len) > avail)
       return E_BUFFER_NOT_FULL;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
 
     if (id < 0)  // error
       return static_cast<long>(id);
 
-    if (id != 0x0F43B675)  // weird: not cluster ID
-      return -1;  // generic error
+    if (id != mkvmuxer::kMkvCluster)
+      return E_PARSE_FAILED;
 
     pos += len;  // consume Cluster ID field
 
@@ -6515,7 +6644,7 @@ long Cluster::HasBlockEntries(
     if ((pos + len) > avail)
       return E_BUFFER_NOT_FULL;
 
-    const long long id = ReadUInt(pReader, pos, len);
+    const long long id = ReadID(pReader, pos, len);
 
     if (id < 0)  // error
       return static_cast<long>(id);
@@ -6524,10 +6653,10 @@ long Cluster::HasBlockEntries(
     // that we have exhausted the sub-element's inside the cluster
     // whose ID we parsed earlier.
 
-    if (id == 0x0F43B675)  // Cluster ID
+    if (id == mkvmuxer::kMkvCluster)
       return 0;  // no entries found
 
-    if (id == 0x0C53BB6B)  // Cues ID
+    if (id == mkvmuxer::kMkvCues)
       return 0;  // no entries found
 
     pos += len;  // consume id field
@@ -6579,14 +6708,15 @@ long Cluster::HasBlockEntries(
     if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
       return E_FILE_FORMAT_INVALID;
 
-    if (id == 0x20)  // BlockGroup ID
+    if (id == mkvmuxer::kMkvBlockGroup)
       return 1;  // have at least one entry
 
-    if (id == 0x23)  // SimpleBlock ID
+    if (id == mkvmuxer::kMkvSimpleBlock)
       return 1;  // have at least one entry
 
     pos += size;  // consume payload
-    assert((cluster_stop < 0) || (pos <= cluster_stop));
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
   }
 }
 
@@ -6656,14 +6786,17 @@ long long Cluster::GetLastTime() const {
 long Cluster::CreateBlock(long long id,
                           long long pos,  // absolute pos of payload
                           long long size, long long discard_padding) {
-  assert((id == 0x20) || (id == 0x23));  // BlockGroup or SimpleBlock
+  if (id != mkvmuxer::kMkvBlockGroup && id != mkvmuxer::kMkvSimpleBlock)
+    return E_PARSE_FAILED;
 
   if (m_entries_count < 0) {  // haven't parsed anything yet
     assert(m_entries == NULL);
     assert(m_entries_size == 0);
 
     m_entries_size = 1024;
-    m_entries = new BlockEntry*[m_entries_size];
+    m_entries = new (std::nothrow) BlockEntry*[m_entries_size];
+    if (m_entries == NULL)
+      return -1;
 
     m_entries_count = 0;
   } else {
@@ -6674,8 +6807,9 @@ long Cluster::CreateBlock(long long id,
     if (m_entries_count >= m_entries_size) {
       const long entries_size = 2 * m_entries_size;
 
-      BlockEntry** const entries = new BlockEntry*[entries_size];
-      assert(entries);
+      BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size];
+      if (entries == NULL)
+        return -1;
 
       BlockEntry** src = m_entries;
       BlockEntry** const src_end = src + m_entries_count;
@@ -6692,9 +6826,9 @@ long Cluster::CreateBlock(long long id,
     }
   }
 
-  if (id == 0x20)  // BlockGroup ID
+  if (id == mkvmuxer::kMkvBlockGroup)
     return CreateBlockGroup(pos, size, discard_padding);
-  else  // SimpleBlock ID
+  else
     return CreateSimpleBlock(pos, size);
 }
 
@@ -6725,9 +6859,9 @@ long Cluster::CreateBlockGroup(long long start_offset, long long size,
 
   while (pos < stop) {
     long len;
-    const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);  // TODO
-    assert((pos + len) <= stop);
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
 
     pos += len;  // consume ID
 
@@ -6737,12 +6871,12 @@ long Cluster::CreateBlockGroup(long long start_offset, long long size,
 
     pos += len;  // consume size
 
-    if (id == 0x21) {  // Block ID
+    if (id == mkvmuxer::kMkvBlock) {
       if (bpos < 0) {  // Block ID
         bpos = pos;
         bsize = size;
       }
-    } else if (id == 0x1B) {  // Duration ID
+    } else if (id == mkvmuxer::kMkvBlockDuration) {
       if (size > 8)
         return E_FILE_FORMAT_INVALID;
 
@@ -6750,7 +6884,7 @@ long Cluster::CreateBlockGroup(long long start_offset, long long size,
 
       if (duration < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == 0x7B) {  // ReferenceBlock
+    } else if (id == mkvmuxer::kMkvReferenceBlock) {
       if (size > 8 || size <= 0)
         return E_FILE_FORMAT_INVALID;
       const long size_ = static_cast<long>(size);
@@ -6764,17 +6898,19 @@ long Cluster::CreateBlockGroup(long long start_offset, long long size,
 
       if (time <= 0)  // see note above
         prev = time;
-      else  // weird
+      else
         next = time;
     }
 
     pos += size;  // consume payload
-    assert(pos <= stop);
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
   }
   if (bpos < 0)
     return E_FILE_FORMAT_INVALID;
 
-  assert(pos == stop);
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
   assert(bsize >= 0);
 
   const long idx = m_entries_count;
@@ -7213,7 +7349,9 @@ long Block::Parse(const Cluster* pCluster) {
       return E_FILE_FORMAT_INVALID;
 
     m_frame_count = 1;
-    m_frames = new Frame[m_frame_count];
+    m_frames = new (std::nothrow) Frame[m_frame_count];
+    if (m_frames == NULL)
+      return -1;
 
     Frame& f = m_frames[0];
     f.pos = pos;
@@ -7239,18 +7377,23 @@ long Block::Parse(const Cluster* pCluster) {
     return E_FILE_FORMAT_INVALID;
 
   ++pos;  // consume frame count
-  assert(pos <= stop);
+  if (pos > stop)
+    return E_FILE_FORMAT_INVALID;
 
   m_frame_count = int(biased_count) + 1;
 
-  m_frames = new Frame[m_frame_count];
-  assert(m_frames);
+  m_frames = new (std::nothrow) Frame[m_frame_count];
+  if (m_frames == NULL)
+    return -1;
+
+  if (!m_frames)
+    return E_FILE_FORMAT_INVALID;
 
   if (lacing == 1) {  // Xiph
     Frame* pf = m_frames;
     Frame* const pf_end = pf + m_frame_count;
 
-    long size = 0;
+    long long size = 0;
     int frame_count = m_frame_count;
 
     while (frame_count > 1) {
@@ -7277,6 +7420,8 @@ long Block::Parse(const Cluster* pCluster) {
 
       Frame& f = *pf++;
       assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
 
       f.pos = 0;  // patch later
 
@@ -7289,8 +7434,8 @@ long Block::Parse(const Cluster* pCluster) {
       --frame_count;
     }
 
-    assert(pf < pf_end);
-    assert(pos <= stop);
+    if (pf >= pf_end || pos > stop)
+      return E_FILE_FORMAT_INVALID;
 
     {
       Frame& f = *pf++;
@@ -7318,11 +7463,17 @@ long Block::Parse(const Cluster* pCluster) {
       Frame& f = *pf++;
       assert((pos + f.len) <= stop);
 
+      if ((pos + f.len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
       f.pos = pos;
       pos += f.len;
     }
 
     assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+
   } else if (lacing == 2) {  // fixed-size lacing
     if (pos >= stop)
       return E_FILE_FORMAT_INVALID;
@@ -7342,6 +7493,8 @@ long Block::Parse(const Cluster* pCluster) {
 
     while (pf != pf_end) {
       assert((pos + frame_size) <= stop);
+      if ((pos + frame_size) > stop)
+        return E_FILE_FORMAT_INVALID;
 
       Frame& f = *pf++;
 
@@ -7352,13 +7505,16 @@ long Block::Parse(const Cluster* pCluster) {
     }
 
     assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+
   } else {
     assert(lacing == 3);  // EBML lacing
 
     if (pos >= stop)
       return E_FILE_FORMAT_INVALID;
 
-    long size = 0;
+    long long size = 0;
     int frame_count = m_frame_count;
 
     long long frame_size = ReadUInt(pReader, pos, len);
@@ -7396,6 +7552,9 @@ long Block::Parse(const Cluster* pCluster) {
         return E_FILE_FORMAT_INVALID;
 
       assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
 
       const Frame& prev = *pf++;
       assert(prev.len == frame_size);
@@ -7403,6 +7562,8 @@ long Block::Parse(const Cluster* pCluster) {
         return E_FILE_FORMAT_INVALID;
 
       assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
 
       Frame& curr = *pf;
 
@@ -7417,7 +7578,8 @@ long Block::Parse(const Cluster* pCluster) {
         return E_FILE_FORMAT_INVALID;
 
       pos += len;  // consume length of (delta) size
-      assert(pos <= stop);
+      if (pos > stop)
+        return E_FILE_FORMAT_INVALID;
 
       const int exp = 7 * len - 1;
       const long long bias = (1LL << exp) - 1LL;
@@ -7439,18 +7601,20 @@ long Block::Parse(const Cluster* pCluster) {
 
     // parse last frame
     if (frame_count > 0) {
-      assert(pos <= stop);
-      assert(pf < pf_end);
+      if (pos > stop || pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
 
       const Frame& prev = *pf++;
       assert(prev.len == frame_size);
       if (prev.len != frame_size)
         return E_FILE_FORMAT_INVALID;
 
-      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
 
       Frame& curr = *pf++;
-      assert(pf == pf_end);
+      if (pf != pf_end)
+        return E_FILE_FORMAT_INVALID;
 
       curr.pos = 0;  // patch later
 
@@ -7471,6 +7635,8 @@ long Block::Parse(const Cluster* pCluster) {
     while (pf != pf_end) {
       Frame& f = *pf++;
       assert((pos + f.len) <= stop);
+      if ((pos + f.len) > stop)
+        return E_FILE_FORMAT_INVALID;
 
       f.pos = pos;
       pos += f.len;
diff --git a/libvpx/third_party/libwebm/mkvparser.hpp b/libvpx/third_party/libwebm/mkvparser.hpp
index aa0b4326..75ef69d7 100644
--- a/libvpx/third_party/libwebm/mkvparser.hpp
+++ b/libvpx/third_party/libwebm/mkvparser.hpp
@@ -9,12 +9,13 @@
 #ifndef MKVPARSER_HPP
 #define MKVPARSER_HPP
 
-#include <cstdlib>
-#include <cstdio>
 #include <cstddef>
+#include <cstdio>
+#include <cstdlib>
 
 namespace mkvparser {
 
+const int E_PARSE_FAILED = -1;
 const int E_FILE_FORMAT_INVALID = -2;
 const int E_BUFFER_NOT_FULL = -3;
 
@@ -27,8 +28,11 @@ class IMkvReader {
   virtual ~IMkvReader();
 };
 
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+                                             unsigned long long element_size);
 long long GetUIntLength(IMkvReader*, long long, long&);
 long long ReadUInt(IMkvReader*, long long, long&);
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
 long long UnserializeUInt(IMkvReader*, long long pos, long long size);
 
 long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
@@ -833,7 +837,7 @@ class Cues {
 
  private:
   bool Init() const;
-  void PreloadCuePoint(long&, long long) const;
+  bool PreloadCuePoint(long&, long long) const;
 
   mutable CuePoint** m_cue_points;
   mutable long m_count;
@@ -999,8 +1003,8 @@ class Segment {
   long DoLoadClusterUnknownSize(long long&, long&);
   long DoParseNext(const Cluster*&, long long&, long&);
 
-  void AppendCluster(Cluster*);
-  void PreloadCluster(Cluster*, ptrdiff_t);
+  bool AppendCluster(Cluster*);
+  bool PreloadCluster(Cluster*, ptrdiff_t);
 
   // void ParseSeekHead(long long pos, long long size);
   // void ParseSeekEntry(long long pos, long long size);
diff --git a/libvpx/third_party/libwebm/webmids.hpp b/libvpx/third_party/libwebm/webmids.hpp
index 6874e44e..ad4ab573 100644
--- a/libvpx/third_party/libwebm/webmids.hpp
+++ b/libvpx/third_party/libwebm/webmids.hpp
@@ -41,6 +41,7 @@ enum MkvId {
   kMkvTimecodeScale = 0x2AD7B1,
   kMkvDuration = 0x4489,
   kMkvDateUTC = 0x4461,
+  kMkvTitle = 0x7BA9,
   kMkvMuxingApp = 0x4D80,
   kMkvWritingApp = 0x5741,
   // Cluster
@@ -107,9 +108,16 @@ enum MkvId {
   kMkvContentEncodingOrder = 0x5031,
   kMkvContentEncodingScope = 0x5032,
   kMkvContentEncodingType = 0x5033,
+  kMkvContentCompression = 0x5034,
+  kMkvContentCompAlgo = 0x4254,
+  kMkvContentCompSettings = 0x4255,
   kMkvContentEncryption = 0x5035,
   kMkvContentEncAlgo = 0x47E1,
   kMkvContentEncKeyID = 0x47E2,
+  kMkvContentSignature = 0x47E3,
+  kMkvContentSigKeyID = 0x47E4,
+  kMkvContentSigAlgo = 0x47E5,
+  kMkvContentSigHashAlgo = 0x47E6,
   kMkvContentEncAESSettings = 0x47E7,
   kMkvAESSettingsCipherMode = 0x47E8,
   kMkvAESSettingsCipherInitData = 0x47E9,
diff --git a/libvpx/third_party/x86inc/README.libvpx b/libvpx/third_party/x86inc/README.libvpx
index fe5b0761..e91e305a 100644
--- a/libvpx/third_party/x86inc/README.libvpx
+++ b/libvpx/third_party/x86inc/README.libvpx
@@ -20,3 +20,5 @@ Copy PIC 'GLOBAL' macros from x86_abi_support.asm
 Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
 Use .text with no alignment for aout
 Only use 'hidden' visibility with Chromium
+Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
+  'ALIGNMODE'.
diff --git a/libvpx/third_party/x86inc/x86inc.asm b/libvpx/third_party/x86inc/x86inc.asm
index 77a58f29..be59de31 100644
--- a/libvpx/third_party/x86inc/x86inc.asm
+++ b/libvpx/third_party/x86inc/x86inc.asm
@@ -876,6 +876,10 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
 
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
@@ -912,7 +916,6 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %endif
 
     %ifdef __NASM_VER__
-        %use smartalign
         ALIGNMODE k7
     %elif ARCH_X86_64 || cpuflag(sse2)
         CPU amdnop
diff --git a/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm
deleted file mode 100644
index c5ec824b..00000000
--- a/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm
+++ /dev/null
@@ -1,611 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_intra4x4_predict_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-
-;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
-;                                B_PREDICTION_MODE left_stride, int b_mode,
-;                                unsigned char *dst, int dst_stride,
-;                                unsigned char top_left)
-
-; r0: *Above
-; r1: *yleft
-; r2: left_stride
-; r3: b_mode
-; sp + #40: dst
-; sp + #44: dst_stride
-; sp + #48: top_left
-|vp8_intra4x4_predict_armv6| PROC
-    push        {r4-r12, lr}
-
-    cmp         r3, #10
-    addlt       pc, pc, r3, lsl #2       ; position independent switch
-    pop         {r4-r12, pc}             ; default
-    b           b_dc_pred
-    b           b_tm_pred
-    b           b_ve_pred
-    b           b_he_pred
-    b           b_ld_pred
-    b           b_rd_pred
-    b           b_vr_pred
-    b           b_vl_pred
-    b           b_hd_pred
-    b           b_hu_pred
-
-b_dc_pred
-    ; load values
-    ldr         r8, [r0]                 ; Above
-    ldrb        r4, [r1], r2             ; Left[0]
-    mov         r9, #0
-    ldrb        r5, [r1], r2             ; Left[1]
-    ldrb        r6, [r1], r2             ; Left[2]
-    usad8       r12, r8, r9
-    ldrb        r7, [r1]                 ; Left[3]
-
-    ; calculate dc
-    add         r4, r4, r5
-    add         r4, r4, r6
-    add         r4, r4, r7
-    add         r4, r4, r12
-    add         r4, r4, #4
-    ldr         r0, [sp, #44]           ; dst_stride
-    mov         r12, r4, asr #3         ; (expected_dc + 4) >> 3
-
-    add         r12, r12, r12, lsl #8
-    ldr         r3, [sp, #40]           ; dst
-    add         r12, r12, r12, lsl #16
-
-    ; store values
-    str         r12, [r3], r0
-    str         r12, [r3], r0
-    str         r12, [r3], r0
-    str         r12, [r3]
-
-    pop        {r4-r12, pc}
-
-b_tm_pred
-    ldr         r8, [r0]                ; Above
-    ldrb        r9, [sp, #48]           ; top_left
-    ldrb        r4, [r1], r2            ; Left[0]
-    ldrb        r5, [r1], r2            ; Left[1]
-    ldrb        r6, [r1], r2            ; Left[2]
-    ldrb        r7, [r1]                ; Left[3]
-    ldr         r0, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    add         r9, r9, r9, lsl #16     ; [tl|tl]
-    uxtb16      r10, r8                 ; a[2|0]
-    uxtb16      r11, r8, ror #8         ; a[3|1]
-    ssub16      r10, r10, r9            ; a[2|0] - [tl|tl]
-    ssub16      r11, r11, r9            ; a[3|1] - [tl|tl]
-
-    add         r4, r4, r4, lsl #16     ; l[0|0]
-    add         r5, r5, r5, lsl #16     ; l[1|1]
-    add         r6, r6, r6, lsl #16     ; l[2|2]
-    add         r7, r7, r7, lsl #16     ; l[3|3]
-
-    sadd16      r1, r4, r10             ; l[0|0] + a[2|0] - [tl|tl]
-    sadd16      r2, r4, r11             ; l[0|0] + a[3|1] - [tl|tl]
-    usat16      r1, #8, r1
-    usat16      r2, #8, r2
-
-    sadd16      r4, r5, r10             ; l[1|1] + a[2|0] - [tl|tl]
-    sadd16      r5, r5, r11             ; l[1|1] + a[3|1] - [tl|tl]
-
-    add         r12, r1, r2, lsl #8     ; [3|2|1|0]
-    str         r12, [r3], r0
-
-    usat16      r4, #8, r4
-    usat16      r5, #8, r5
-
-    sadd16      r1, r6, r10             ; l[2|2] + a[2|0] - [tl|tl]
-    sadd16      r2, r6, r11             ; l[2|2] + a[3|1] - [tl|tl]
-
-    add         r12, r4, r5, lsl #8     ; [3|2|1|0]
-    str         r12, [r3], r0
-
-    usat16      r1, #8, r1
-    usat16      r2, #8, r2
-
-    sadd16      r4, r7, r10             ; l[3|3] + a[2|0] - [tl|tl]
-    sadd16      r5, r7, r11             ; l[3|3] + a[3|1] - [tl|tl]
-
-    add         r12, r1, r2, lsl #8     ; [3|2|1|0]
-
-    usat16      r4, #8, r4
-    usat16      r5, #8, r5
-
-    str         r12, [r3], r0
-
-    add         r12, r4, r5, lsl #8     ; [3|2|1|0]
-    str         r12, [r3]
-
-    pop        {r4-r12, pc}
-
-b_ve_pred
-    ldr         r8, [r0]                ; a[3|2|1|0]
-    ldr         r11, c00FF00FF
-    ldrb        r9, [sp, #48]           ; top_left
-    ldrb        r10, [r0, #4]           ; a[4]
-
-    ldr         r0, c00020002
-
-    uxtb16      r4, r8                  ; a[2|0]
-    uxtb16      r5, r8, ror #8          ; a[3|1]
-    ldr         r2, [sp, #44]           ; dst_stride
-    pkhbt       r9, r9, r5, lsl #16     ; a[1|-1]
-
-    add         r9, r9, r4, lsl #1      ;[a[1]+2*a[2]       | tl+2*a[0]       ]
-    uxtab16     r9, r9, r5              ;[a[1]+2*a[2]+a[3]  | tl+2*a[0]+a[1]  ]
-    ldr         r3, [sp, #40]           ; dst
-    uxtab16     r9, r9, r0              ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
-
-    add         r0, r0, r10, lsl #16    ;[a[4]+2            |                 2]
-    add         r0, r0, r4, asr #16     ;[a[4]+2            |            a[2]+2]
-    add         r0, r0, r5, lsl #1      ;[a[4]+2*a[3]+2     |     a[2]+2*a[1]+2]
-    uadd16      r4, r4, r0              ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2]
-
-    and         r9, r11, r9, asr #2
-    and         r4, r11, r4, asr #2
-    add         r9, r9, r4, lsl #8
-
-    ; store values
-    str         r9, [r3], r2
-    str         r9, [r3], r2
-    str         r9, [r3], r2
-    str         r9, [r3]
-
-    pop        {r4-r12, pc}
-
-
-b_he_pred
-    ldrb        r4, [r1], r2            ; Left[0]
-    ldrb        r8, [sp, #48]           ; top_left
-    ldrb        r5, [r1], r2            ; Left[1]
-    ldrb        r6, [r1], r2            ; Left[2]
-    ldrb        r7, [r1]                ; Left[3]
-
-    add         r8, r8, r4              ; tl   + l[0]
-    add         r9, r4, r5              ; l[0] + l[1]
-    add         r10, r5, r6             ; l[1] + l[2]
-    add         r11, r6, r7             ; l[2] + l[3]
-
-    mov         r0, #2<<14
-
-    add         r8, r8, r9              ; tl + 2*l[0] + l[1]
-    add         r4, r9, r10             ; l[0] + 2*l[1] + l[2]
-    add         r5, r10, r11            ; l[1] + 2*l[2] + l[3]
-    add         r6, r11, r7, lsl #1     ; l[2] + 2*l[3] + l[3]
-
-
-    add         r8, r0, r8, lsl #14     ; (tl + 2*l[0] + l[1])>>2 in top half
-    add         r9, r0, r4, lsl #14     ; (l[0] + 2*l[1] + l[2])>>2 in top half
-    add         r10,r0, r5, lsl #14     ; (l[1] + 2*l[2] + l[3])>>2 in top half
-    add         r11,r0, r6, lsl #14     ; (l[2] + 2*l[3] + l[3])>>2 in top half
-
-    pkhtb       r8, r8, r8, asr #16     ; l[-|0|-|0]
-    pkhtb       r9, r9, r9, asr #16     ; l[-|1|-|1]
-    pkhtb       r10, r10, r10, asr #16  ; l[-|2|-|2]
-    pkhtb       r11, r11, r11, asr #16  ; l[-|3|-|3]
-
-    ldr         r0, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    add         r8, r8, r8, lsl #8      ; l[0|0|0|0]
-    add         r9, r9, r9, lsl #8      ; l[1|1|1|1]
-    add         r10, r10, r10, lsl #8   ; l[2|2|2|2]
-    add         r11, r11, r11, lsl #8   ; l[3|3|3|3]
-
-    ; store values
-    str         r8, [r3], r0
-    str         r9, [r3], r0
-    str         r10, [r3], r0
-    str         r11, [r3]
-
-    pop        {r4-r12, pc}
-
-b_ld_pred
-    ldr         r4, [r0]                ; Above[0-3]
-    ldr         r12, c00020002
-    ldr         r5, [r0, #4]            ; Above[4-7]
-    ldr         lr,  c00FF00FF
-
-    uxtb16      r6, r4                  ; a[2|0]
-    uxtb16      r7, r4, ror #8          ; a[3|1]
-    uxtb16      r8, r5                  ; a[6|4]
-    uxtb16      r9, r5, ror #8          ; a[7|5]
-    pkhtb       r10, r6, r8             ; a[2|4]
-    pkhtb       r11, r7, r9             ; a[3|5]
-
-    add         r4, r6, r7, lsl #1      ; [a2+2*a3      |      a0+2*a1]
-    add         r4, r4, r10, ror #16    ; [a2+2*a3+a4   |   a0+2*a1+a2]
-    uxtab16     r4, r4, r12             ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
-
-    add         r5, r7, r10, ror #15    ; [a3+2*a4      |      a1+2*a2]
-    add         r5, r5, r11, ror #16    ; [a3+2*a4+a5   |   a1+2*a2+a3]
-    uxtab16     r5, r5, r12             ; [a3+2*a4+a5+2 | a1+2*a2+a3+2]
-
-    pkhtb       r7, r9, r8, asr #16
-    add         r6, r8, r9, lsl #1      ; [a6+2*a7      |      a4+2*a5]
-    uadd16      r6, r6, r7              ; [a6+2*a7+a7   |   a4+2*a5+a6]
-    uxtab16     r6, r6, r12             ; [a6+2*a7+a7+2 | a4+2*a5+a6+2]
-
-    uxth        r7, r9                  ; [                         a5]
-    add         r7, r7, r8, asr #15     ; [                    a5+2*a6]
-    add         r7, r7, r9, asr #16     ; [                 a5+2*a6+a7]
-    uxtah       r7, r7, r12             ; [               a5+2*a6+a7+2]
-
-    ldr         r0, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    ; scale down
-    and         r4, lr, r4, asr #2
-    and         r5, lr, r5, asr #2
-    and         r6, lr, r6, asr #2
-    mov         r7, r7, asr #2
-
-    add         r8, r4, r5, lsl #8      ; [3|2|1|0]
-    str         r8, [r3], r0
-
-    mov         r9, r8, lsr #8
-    add         r9, r9, r6, lsl #24     ; [4|3|2|1]
-    str         r9, [r3], r0
-
-    mov         r10, r9, lsr #8
-    add         r10, r10, r7, lsl #24   ; [5|4|3|2]
-    str         r10, [r3], r0
-
-    mov         r6, r6, lsr #16
-    mov         r11, r10, lsr #8
-    add         r11, r11, r6, lsl #24   ; [6|5|4|3]
-    str         r11, [r3]
-
-    pop        {r4-r12, pc}
-
-b_rd_pred
-    ldrb        r7, [r1], r2            ; l[0] = pp[3]
-    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
-    ldrb        r8, [sp, #48]           ; tl   = pp[4]
-    ldrb        r6, [r1], r2            ; l[1] = pp[2]
-    ldrb        r5, [r1], r2            ; l[2] = pp[1]
-    ldrb        r4, [r1], r2            ; l[3] = pp[0]
-
-
-    uxtb16      r9, lr                  ; p[7|5]
-    uxtb16      r10, lr, ror #8         ; p[8|6]
-    add         r4, r4, r6, lsl #16     ; p[2|0]
-    add         r5, r5, r7, lsl #16     ; p[3|1]
-    add         r6, r6, r8, lsl #16     ; p[4|2]
-    pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
-    pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
-
-    ldr         r12, c00020002
-    ldr         lr,  c00FF00FF
-
-    add         r4, r4, r5, lsl #1      ; [p2+2*p3      |      p0+2*p1]
-    add         r4, r4, r6              ; [p2+2*p3+p4   |   p0+2*p1+p2]
-    uxtab16     r4, r4, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
-
-    add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
-    add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
-    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
-
-    add         r6, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
-    add         r6, r6, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
-    uxtab16     r6, r6, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
-
-    add         r7, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
-    add         r7, r7, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
-    uxtab16     r7, r7, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
-
-    ldr         r0, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    ; scale down
-    and         r7, lr, r7, asr #2
-    and         r6, lr, r6, asr #2
-    and         r5, lr, r5, asr #2
-    and         r4, lr, r4, asr #2
-
-    add         r8, r6, r7, lsl #8      ; [6|5|4|3]
-    str         r8, [r3], r0
-
-    mov         r9, r8, lsl #8          ; [5|4|3|-]
-    uxtab       r9, r9, r4, ror #16     ; [5|4|3|2]
-    str         r9, [r3], r0
-
-    mov         r10, r9, lsl #8         ; [4|3|2|-]
-    uxtab       r10, r10, r5            ; [4|3|2|1]
-    str         r10, [r3], r0
-
-    mov         r11, r10, lsl #8        ; [3|2|1|-]
-    uxtab       r11, r11, r4            ; [3|2|1|0]
-    str         r11, [r3]
-
-    pop        {r4-r12, pc}
-
-b_vr_pred
-    ldrb        r7, [r1], r2            ; l[0] = pp[3]
-    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
-    ldrb        r8, [sp, #48]           ; tl   = pp[4]
-    ldrb        r6, [r1], r2            ; l[1] = pp[2]
-    ldrb        r5, [r1], r2            ; l[2] = pp[1]
-    ldrb        r4, [r1]                ; l[3] = pp[0]
-
-    add         r5, r5, r7, lsl #16     ; p[3|1]
-    add         r6, r6, r8, lsl #16     ; p[4|2]
-    uxtb16      r9, lr                  ; p[7|5]
-    uxtb16      r10, lr, ror #8         ; p[8|6]
-    pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
-    pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
-
-    ldr         r4,  c00010001
-    ldr         r12, c00020002
-    ldr         lr,  c00FF00FF
-
-    add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
-    add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
-    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
-
-    add         r6, r6, r7, lsl #1      ; [p4+2*p5      |      p2+2*p3]
-    add         r6, r6, r8              ; [p4+2*p5+p6   |   p2+2*p3+p4]
-    uxtab16     r6, r6, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
-
-    uadd16      r11, r8, r9             ; [p6+p7        |        p4+p5]
-    uhadd16     r11, r11, r4            ; [(p6+p7+1)>>1 | (p4+p5+1)>>1]
-                                        ; [F|E]
-
-    add         r7, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
-    add         r7, r7, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
-    uxtab16     r7, r7, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
-
-    uadd16      r2, r9, r10             ; [p7+p8        |        p5+p6]
-    uhadd16     r2, r2, r4              ; [(p7+p8+1)>>1 | (p5+p6+1)>>1]
-                                        ; [J|I]
-
-    add         r8, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
-    add         r8, r8, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
-    uxtab16     r8, r8, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
-
-    ldr         r0, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    ; scale down
-    and         r5, lr, r5, asr #2      ; [B|A]
-    and         r6, lr, r6, asr #2      ; [D|C]
-    and         r7, lr, r7, asr #2      ; [H|G]
-    and         r8, lr, r8, asr #2      ; [L|K]
-
-    add         r12, r11, r2, lsl #8    ; [J|F|I|E]
-    str         r12, [r3], r0
-
-    add         r12, r7, r8, lsl #8     ; [L|H|K|G]
-    str         r12, [r3], r0
-
-    pkhbt       r2, r6, r2, lsl #16     ; [-|I|-|C]
-    add         r2, r2, r11, lsl #8     ; [F|I|E|C]
-
-    pkhtb       r12, r6, r5             ; [-|D|-|A]
-    pkhtb       r10, r7, r5, asr #16    ; [-|H|-|B]
-    str         r2, [r3], r0
-    add         r12, r12, r10, lsl #8   ; [H|D|B|A]
-    str         r12, [r3]
-
-    pop        {r4-r12, pc}
-
-b_vl_pred
-    ldr         r4, [r0]                ; [3|2|1|0] = Above[0-3]
-    ldr         r12, c00020002
-    ldr         r5, [r0, #4]            ; [7|6|5|4] = Above[4-7]
-    ldr         lr,  c00FF00FF
-    ldr         r2,  c00010001
-
-    mov         r0, r4, lsr #16         ; [-|-|3|2]
-    add         r0, r0, r5, lsl #16     ; [5|4|3|2]
-    uxtb16      r6, r4                  ; [2|0]
-    uxtb16      r7, r4, ror #8          ; [3|1]
-    uxtb16      r8, r0                  ; [4|2]
-    uxtb16      r9, r0, ror #8          ; [5|3]
-    uxtb16      r10, r5                 ; [6|4]
-    uxtb16      r11, r5, ror #8         ; [7|5]
-
-    uadd16      r4, r6, r7              ; [p2+p3        |        p0+p1]
-    uhadd16     r4, r4, r2              ; [(p2+p3+1)>>1 | (p0+p1+1)>>1]
-                                        ; [B|A]
-
-    add         r5, r6, r7, lsl #1      ; [p2+2*p3      |      p0+2*p1]
-    add         r5, r5, r8              ; [p2+2*p3+p4   |   p0+2*p1+p2]
-    uxtab16     r5, r5, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
-
-    uadd16      r6, r7, r8              ; [p3+p4        |        p1+p2]
-    uhadd16     r6, r6, r2              ; [(p3+p4+1)>>1 | (p1+p2+1)>>1]
-                                        ; [F|E]
-
-    add         r7, r7, r8, lsl #1      ; [p3+2*p4      |      p1+2*p2]
-    add         r7, r7, r9              ; [p3+2*p4+p5   |   p1+2*p2+p3]
-    uxtab16     r7, r7, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
-
-    add         r8, r8, r9, lsl #1      ; [p4+2*p5      |      p2+2*p3]
-    add         r8, r8, r10             ; [p4+2*p5+p6   |   p2+2*p3+p4]
-    uxtab16     r8, r8, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
-
-    add         r9, r9, r10, lsl #1     ; [p5+2*p6      |      p3+2*p4]
-    add         r9, r9, r11             ; [p5+2*p6+p7   |   p3+2*p4+p5]
-    uxtab16     r9, r9, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
-
-    ldr         r0, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    ; scale down
-    and         r5, lr, r5, asr #2      ; [D|C]
-    and         r7, lr, r7, asr #2      ; [H|G]
-    and         r8, lr, r8, asr #2      ; [I|D]
-    and         r9, lr, r9, asr #2      ; [J|H]
-
-    add         r10, r4, r6, lsl #8     ; [F|B|E|A]
-    str         r10, [r3], r0
-
-    add         r5, r5, r7, lsl #8      ; [H|C|G|D]
-    str         r5, [r3], r0
-
-    pkhtb       r12, r8, r4, asr #16    ; [-|I|-|B]
-    pkhtb       r10, r9, r8             ; [-|J|-|D]
-
-    add         r12, r6, r12, lsl #8    ; [I|F|B|E]
-    str         r12, [r3], r0
-
-    add         r10, r7, r10, lsl #8    ; [J|H|D|G]
-    str         r10, [r3]
-
-    pop        {r4-r12, pc}
-
-b_hd_pred
-    ldrb        r7, [r1], r2            ; l[0] = pp[3]
-    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
-    ldrb        r8, [sp, #48]           ; tl   = pp[4]
-    ldrb        r6, [r1], r2            ; l[1] = pp[2]
-    ldrb        r5, [r1], r2            ; l[2] = pp[1]
-    ldrb        r4, [r1]                ; l[3] = pp[0]
-
-    uxtb16      r9, lr                  ; p[7|5]
-    uxtb16      r10, lr, ror #8         ; p[8|6]
-
-    add         r4, r4, r5, lsl #16     ; p[1|0]
-    add         r5, r5, r6, lsl #16     ; p[2|1]
-    add         r6, r6, r7, lsl #16     ; p[3|2]
-    add         r7, r7, r8, lsl #16     ; p[4|3]
-
-    ldr         r12, c00020002
-    ldr         lr,  c00FF00FF
-    ldr         r2,  c00010001
-
-    pkhtb       r8, r7, r9              ; p[4|5]
-    pkhtb       r1, r9, r10             ; p[7|6]
-    pkhbt       r10, r8, r10, lsl #16   ; p[6|5]
-
-    uadd16      r11, r4, r5             ; [p1+p2        |        p0+p1]
-    uhadd16     r11, r11, r2            ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
-                                        ; [B|A]
-
-    add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
-    add         r4, r4, r6              ; [p1+2*p2+p3   |   p0+2*p1+p2]
-    uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
-
-    uadd16      r0, r6, r7              ; [p3+p4        |        p2+p3]
-    uhadd16     r0, r0, r2              ; [(p3+p4+1)>>1 | (p2+p3+1)>>1]
-                                        ; [F|E]
-
-    add         r5, r6, r7, lsl #1      ; [p3+2*p4      |      p2+2*p3]
-    add         r5, r5, r8, ror #16     ; [p3+2*p4+p5   |   p2+2*p3+p4]
-    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p2+2*p3+p4+2]
-
-    add         r6, r12, r8, ror #16    ; [p5+2         |         p4+2]
-    add         r6, r6, r10, lsl #1     ; [p5+2+2*p6    |    p4+2+2*p5]
-    uxtab16     r6, r6, r1              ; [p5+2+2*p6+p7 | p4+2+2*p5+p6]
-
-    ; scale down
-    and         r4, lr, r4, asr #2      ; [D|C]
-    and         r5, lr, r5, asr #2      ; [H|G]
-    and         r6, lr, r6, asr #2      ; [J|I]
-
-    ldr         lr, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-
-    pkhtb       r2, r0, r6              ; [-|F|-|I]
-    pkhtb       r12, r6, r5, asr #16    ; [-|J|-|H]
-    add         r12, r12, r2, lsl #8    ; [F|J|I|H]
-    add         r2, r0, r5, lsl #8      ; [H|F|G|E]
-    mov         r12, r12, ror #24       ; [J|I|H|F]
-    str         r12, [r3], lr
-
-    mov         r7, r11, asr #16        ; [-|-|-|B]
-    str         r2, [r3], lr
-    add         r7, r7, r0, lsl #16     ; [-|E|-|B]
-    add         r7, r7, r4, asr #8      ; [-|E|D|B]
-    add         r7, r7, r5, lsl #24     ; [G|E|D|B]
-    str         r7, [r3], lr
-
-    add         r5, r11, r4, lsl #8     ; [D|B|C|A]
-    str         r5, [r3]
-
-    pop        {r4-r12, pc}
-
-
-
-b_hu_pred
-    ldrb        r4, [r1], r2            ; Left[0]
-    ldr         r12, c00020002
-    ldrb        r5, [r1], r2            ; Left[1]
-    ldr         lr,  c00FF00FF
-    ldrb        r6, [r1], r2            ; Left[2]
-    ldr         r2,  c00010001
-    ldrb        r7, [r1]                ; Left[3]
-
-    add         r4, r4, r5, lsl #16     ; [1|0]
-    add         r5, r5, r6, lsl #16     ; [2|1]
-    add         r9, r6, r7, lsl #16     ; [3|2]
-
-    uadd16      r8, r4, r5              ; [p1+p2        |        p0+p1]
-    uhadd16     r8, r8, r2              ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
-                                        ; [B|A]
-
-    add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
-    add         r4, r4, r9              ; [p1+2*p2+p3   |   p0+2*p1+p2]
-    uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
-    ldr         r2, [sp, #44]           ; dst_stride
-    ldr         r3, [sp, #40]           ; dst
-    and         r4, lr, r4, asr #2      ; [D|C]
-
-    add         r10, r6, r7             ; [p2+p3]
-    add         r11, r10, r7, lsl #1    ; [p2+3*p3]
-    add         r10, r10, #1
-    add         r11, r11, #2
-    mov         r10, r10, asr #1        ; [E]
-    mov         r11, r11, asr #2        ; [F]
-
-    add         r9, r7, r9, asr #8      ; [-|-|G|G]
-    add         r0, r8, r4, lsl #8      ; [D|B|C|A]
-    add         r7, r9, r9, lsl #16     ; [G|G|G|G]
-
-    str         r0, [r3], r2
-
-    mov         r1, r8, asr #16         ; [-|-|-|B]
-    add         r1, r1, r4, asr #8      ; [-|-|D|B]
-    add         r1, r1, r10, lsl #16    ; [-|E|D|B]
-    add         r1, r1, r11, lsl #24    ; [F|E|D|B]
-    str         r1, [r3], r2
-
-    add         r10, r11, lsl #8        ; [-|-|F|E]
-    add         r10, r10, r9, lsl #16   ; [G|G|F|E]
-    str         r10, [r3], r2
-
-    str         r7, [r3]
-
-    pop        {r4-r12, pc}
-
-    ENDP
-
-; constants
-c00010001
-    DCD         0x00010001
-c00020002
-    DCD         0x00020002
-c00FF00FF
-    DCD         0x00FF00FF
-
-    END
diff --git a/libvpx/vp8/common/arm/neon/reconintra_neon.c b/libvpx/vp8/common/arm/neon/reconintra_neon.c
deleted file mode 100644
index af52cd5e..00000000
--- a/libvpx/vp8/common/arm/neon/reconintra_neon.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vp8/common/blockd.h"
-
-void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
-                                           unsigned char * yabove_row,
-                                           unsigned char * yleft,
-                                           int left_stride,
-                                           unsigned char * ypred_ptr,
-                                           int y_stride) {
-  const int mode = x->mode_info_context->mbmi.mode;
-  int i;
-
-  switch (mode) {
-    case DC_PRED:
-    {
-      int shift = x->up_available + x->left_available;
-      uint8x16_t v_expected_dc = vdupq_n_u8(128);
-
-      if (shift) {
-        unsigned int average = 0;
-        int expected_dc;
-        if (x->up_available) {
-          const uint8x16_t v_above = vld1q_u8(yabove_row);
-          const uint16x8_t a = vpaddlq_u8(v_above);
-          const uint32x4_t b = vpaddlq_u16(a);
-          const uint64x2_t c = vpaddlq_u32(b);
-          const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
-                                        vreinterpret_u32_u64(vget_high_u64(c)));
-          average = vget_lane_u32(d, 0);
-        }
-        if (x->left_available) {
-          for (i = 0; i < 16; ++i) {
-              average += yleft[0];
-              yleft += left_stride;
-          }
-        }
-        shift += 3;
-        expected_dc = (average + (1 << (shift - 1))) >> shift;
-        v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
-      }
-      for (i = 0; i < 16; ++i) {
-        vst1q_u8(ypred_ptr, v_expected_dc);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case V_PRED:
-    {
-      const uint8x16_t v_above = vld1q_u8(yabove_row);
-      for (i = 0; i < 16; ++i) {
-        vst1q_u8(ypred_ptr, v_above);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case H_PRED:
-    {
-      for (i = 0; i < 16; ++i) {
-        const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
-        yleft += left_stride;
-        vst1q_u8(ypred_ptr, v_yleft);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case TM_PRED:
-    {
-      const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
-      const uint8x16_t v_above = vld1q_u8(yabove_row);
-      for (i = 0; i < 16; ++i) {
-        const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
-        const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
-        const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
-        const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
-                                         vreinterpretq_s16_u16(v_ytop_left));
-        const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
-                                         vreinterpretq_s16_u16(v_ytop_left));
-        const uint8x8_t pred_lo = vqmovun_s16(b_lo);
-        const uint8x8_t pred_hi = vqmovun_s16(b_hi);
-
-        vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
-        ypred_ptr += y_stride;
-        yleft += left_stride;
-      }
-    }
-    break;
-  }
-}
-
-void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
-                                            unsigned char * uabove_row,
-                                            unsigned char * vabove_row,
-                                            unsigned char * uleft,
-                                            unsigned char * vleft,
-                                            int left_stride,
-                                            unsigned char * upred_ptr,
-                                            unsigned char * vpred_ptr,
-                                            int pred_stride) {
-  const int mode = x->mode_info_context->mbmi.uv_mode;
-  int i;
-
-  switch (mode) {
-    case DC_PRED:
-    {
-      int shift = x->up_available + x->left_available;
-      uint8x8_t v_expected_udc = vdup_n_u8(128);
-      uint8x8_t v_expected_vdc = vdup_n_u8(128);
-
-      if (shift) {
-        unsigned int average_u = 0;
-        unsigned int average_v = 0;
-        int expected_udc;
-        int expected_vdc;
-        if (x->up_available) {
-          const uint8x8_t v_uabove = vld1_u8(uabove_row);
-          const uint8x8_t v_vabove = vld1_u8(vabove_row);
-          const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
-          const uint32x4_t b = vpaddlq_u16(a);
-          const uint64x2_t c = vpaddlq_u32(b);
-          average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
-          average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
-        }
-        if (x->left_available) {
-          for (i = 0; i < 8; ++i) {
-              average_u += uleft[0];
-              uleft += left_stride;
-              average_v += vleft[0];
-              vleft += left_stride;
-          }
-        }
-        shift += 2;
-        expected_udc = (average_u + (1 << (shift - 1))) >> shift;
-        expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
-        v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
-        v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
-      }
-      for (i = 0; i < 8; ++i) {
-        vst1_u8(upred_ptr, v_expected_udc);
-        upred_ptr += pred_stride;
-        vst1_u8(vpred_ptr, v_expected_vdc);
-        vpred_ptr += pred_stride;
-      }
-    }
-    break;
-    case V_PRED:
-    {
-      const uint8x8_t v_uabove = vld1_u8(uabove_row);
-      const uint8x8_t v_vabove = vld1_u8(vabove_row);
-      for (i = 0; i < 8; ++i) {
-        vst1_u8(upred_ptr, v_uabove);
-        upred_ptr += pred_stride;
-        vst1_u8(vpred_ptr, v_vabove);
-        vpred_ptr += pred_stride;
-      }
-    }
-    break;
-    case H_PRED:
-    {
-      for (i = 0; i < 8; ++i) {
-        const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
-        const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
-        uleft += left_stride;
-        vleft += left_stride;
-        vst1_u8(upred_ptr, v_uleft);
-        upred_ptr += pred_stride;
-        vst1_u8(vpred_ptr, v_vleft);
-        vpred_ptr += pred_stride;
-      }
-    }
-    break;
-    case TM_PRED:
-    {
-      const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
-      const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
-      const uint8x8_t v_uabove = vld1_u8(uabove_row);
-      const uint8x8_t v_vabove = vld1_u8(vabove_row);
-      for (i = 0; i < 8; ++i) {
-        const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
-        const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
-        const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
-        const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
-        const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
-                                        vreinterpretq_s16_u16(v_utop_left));
-        const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
-                                        vreinterpretq_s16_u16(v_vtop_left));
-        const uint8x8_t pred_u = vqmovun_s16(b_u);
-        const uint8x8_t pred_v = vqmovun_s16(b_v);
-
-        vst1_u8(upred_ptr, pred_u);
-        vst1_u8(vpred_ptr, pred_v);
-        upred_ptr += pred_stride;
-        vpred_ptr += pred_stride;
-        uleft += left_stride;
-        vleft += left_stride;
-      }
-    }
-    break;
-  }
-}
diff --git a/libvpx/vp8/common/common.h b/libvpx/vp8/common/common.h
index ba3d9f54..e58a9cc2 100644
--- a/libvpx/vp8/common/common.h
+++ b/libvpx/vp8/common/common.h
@@ -22,9 +22,6 @@
 extern "C" {
 #endif
 
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-
 /* Only need this for fixed-size arrays, for structs just assign. */
 
 #define vp8_copy( Dest, Src) { \
diff --git a/libvpx/vp8/common/findnearmv.h b/libvpx/vp8/common/findnearmv.h
index 3c8c0506..155847ca 100644
--- a/libvpx/vp8/common/findnearmv.h
+++ b/libvpx/vp8/common/findnearmv.h
@@ -12,6 +12,7 @@
 #ifndef VP8_COMMON_FINDNEARMV_H_
 #define VP8_COMMON_FINDNEARMV_H_
 
+#include "./vpx_config.h"
 #include "mv.h"
 #include "blockd.h"
 #include "modecont.h"
@@ -22,8 +23,8 @@ extern "C" {
 #endif
 
 
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
-                    const int *ref_frame_sign_bias)
+static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
+                           int_mv *mvp, const int *ref_frame_sign_bias)
 {
     if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
     {
@@ -34,7 +35,7 @@ static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
 
 #define LEFT_TOP_MARGIN (16 << 3)
 #define RIGHT_BOTTOM_MARGIN (16 << 3)
-static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
 {
     if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
         mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
@@ -47,8 +48,9 @@ static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
         mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
 }
 
-static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
-                         int mb_to_top_edge, int mb_to_bottom_edge)
+static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
+                                int mb_to_right_edge, int mb_to_top_edge,
+                                int mb_to_bottom_edge)
 {
     mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
         mb_to_left_edge : mv->as_mv.col;
@@ -59,9 +61,10 @@ static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
     mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
         mb_to_bottom_edge : mv->as_mv.row;
 }
-static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
-                                int mb_to_right_edge, int mb_to_top_edge,
-                                int mb_to_bottom_edge)
+static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+                                               int mb_to_right_edge,
+                                               int mb_to_top_edge,
+                                               int mb_to_bottom_edge)
 {
     unsigned int need_to_clamp;
     need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
@@ -101,7 +104,7 @@ vp8_prob *vp8_mv_ref_probs(
 extern const unsigned char vp8_mbsplit_offset[4][16];
 
 
-static int left_block_mv(const MODE_INFO *cur_mb, int b)
+static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
 {
     if (!(b & 3))
     {
@@ -116,7 +119,7 @@ static int left_block_mv(const MODE_INFO *cur_mb, int b)
     return (cur_mb->bmi + b - 1)->mv.as_int;
 }
 
-static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+static INLINE int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
 {
     if (!(b >> 2))
     {
@@ -130,7 +133,7 @@ static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
 
     return (cur_mb->bmi + (b - 4))->mv.as_int;
 }
-static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
 {
     if (!(b & 3))
     {
@@ -156,7 +159,8 @@ static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
     return (cur_mb->bmi + b - 1)->as_mode;
 }
 
-static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
+static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
+                                                 int mi_stride)
 {
     if (!(b >> 2))
     {
diff --git a/libvpx/vp8/common/invtrans.h b/libvpx/vp8/common/invtrans.h
index affe57e3..9cfea8d5 100644
--- a/libvpx/vp8/common/invtrans.h
+++ b/libvpx/vp8/common/invtrans.h
@@ -12,7 +12,7 @@
 #ifndef VP8_COMMON_INVTRANS_H_
 #define VP8_COMMON_INVTRANS_H_
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp8_rtcd.h"
 #include "blockd.h"
 #include "onyxc_int.h"
@@ -37,7 +37,7 @@ static void eob_adjust(char *eobs, short *diff)
     }
 }
 
-static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
+static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd)
 {
     short *DQC = xd->dequant_y1;
 
diff --git a/libvpx/vp8/common/mips/msa/reconintra_msa.c b/libvpx/vp8/common/mips/msa/reconintra_msa.c
deleted file mode 100644
index 57f705d2..00000000
--- a/libvpx/vp8/common/mips/msa/reconintra_msa.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vp8/common/mips/msa/vp8_macros_msa.h"
-
-static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
-                                       int32_t dst_stride)
-{
-    uint64_t out = LD(src);
-
-    SD4(out, out, out, out, dst, dst_stride);
-    dst += (4 * dst_stride);
-    SD4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
-                                         int32_t dst_stride)
-{
-    v16u8 out = LD_UB(src);
-
-    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-    dst += (8 * dst_stride);
-    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
-                                        uint8_t *dst, int32_t dst_stride)
-{
-    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-
-    out0 = src[0 * src_stride] * 0x0101010101010101ull;
-    out1 = src[1 * src_stride] * 0x0101010101010101ull;
-    out2 = src[2 * src_stride] * 0x0101010101010101ull;
-    out3 = src[3 * src_stride] * 0x0101010101010101ull;
-    out4 = src[4 * src_stride] * 0x0101010101010101ull;
-    out5 = src[5 * src_stride] * 0x0101010101010101ull;
-    out6 = src[6 * src_stride] * 0x0101010101010101ull;
-    out7 = src[7 * src_stride] * 0x0101010101010101ull;
-
-    SD4(out0, out1, out2, out3, dst, dst_stride);
-    dst += (4 * dst_stride);
-    SD4(out4, out5, out6, out7, dst, dst_stride);
-}
-
-static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
-                                          uint8_t *dst, int32_t dst_stride)
-{
-    uint32_t row;
-    uint8_t inp0, inp1, inp2, inp3;
-    v16u8 src0, src1, src2, src3;
-
-    for (row = 4; row--;)
-    {
-        inp0 = src[0];
-        src += src_stride;
-        inp1 = src[0];
-        src += src_stride;
-        inp2 = src[0];
-        src += src_stride;
-        inp3 = src[0];
-        src += src_stride;
-
-        src0 = (v16u8)__msa_fill_b(inp0);
-        src1 = (v16u8)__msa_fill_b(inp1);
-        src2 = (v16u8)__msa_fill_b(inp2);
-        src3 = (v16u8)__msa_fill_b(inp3);
-
-        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-        dst += (4 * dst_stride);
-    }
-}
-
-static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
-                                     int32_t src_stride_left,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     uint8_t is_above, uint8_t is_left)
-{
-    uint32_t row, addition = 0;
-    uint64_t out;
-    v16u8 src_above, store;
-    v8u16 sum_above;
-    v4u32 sum_top;
-    v2u64 sum;
-
-    if (is_left && is_above)
-    {
-        src_above = LD_UB(src_top);
-
-        sum_above = __msa_hadd_u_h(src_above, src_above);
-        sum_top = __msa_hadd_u_w(sum_above, sum_above);
-        sum = __msa_hadd_u_d(sum_top, sum_top);
-        addition = __msa_copy_u_w((v4i32)sum, 0);
-
-        for (row = 0; row < 8; ++row)
-        {
-            addition += src_left[row * src_stride_left];
-        }
-
-        addition = (addition + 8) >> 4;
-        store = (v16u8)__msa_fill_b(addition);
-    }
-    else if (is_left)
-    {
-        for (row = 0; row < 8; ++row)
-        {
-            addition += src_left[row * src_stride_left];
-        }
-
-        addition = (addition + 4) >> 3;
-        store = (v16u8)__msa_fill_b(addition);
-    }
-    else if (is_above)
-    {
-        src_above = LD_UB(src_top);
-
-        sum_above = __msa_hadd_u_h(src_above, src_above);
-        sum_top = __msa_hadd_u_w(sum_above, sum_above);
-        sum = __msa_hadd_u_d(sum_top, sum_top);
-        sum = (v2u64)__msa_srari_d((v2i64)sum, 3);
-        store = (v16u8)__msa_splati_b((v16i8)sum, 0);
-    }
-    else
-    {
-        store = (v16u8)__msa_ldi_b(128);
-    }
-
-    out = __msa_copy_u_d((v2i64)store, 0);
-
-    SD4(out, out, out, out, dst, dst_stride);
-    dst += (4 * dst_stride);
-    SD4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
-                                       int32_t src_stride_left,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       uint8_t is_above, uint8_t is_left)
-{
-    uint32_t row;
-    uint32_t addition = 0;
-    v16u8 src_above, out;
-    v8u16 sum_above;
-    v4u32 sum_top;
-    v2u64 sum;
-
-    if (is_left && is_above)
-    {
-        src_above = LD_UB(src_top);
-
-        sum_above = __msa_hadd_u_h(src_above, src_above);
-        sum_top = __msa_hadd_u_w(sum_above, sum_above);
-        sum = __msa_hadd_u_d(sum_top, sum_top);
-        sum_top = (v4u32)__msa_pckev_w((v4i32)sum, (v4i32)sum);
-        sum = __msa_hadd_u_d(sum_top, sum_top);
-        addition = __msa_copy_u_w((v4i32)sum, 0);
-
-        for (row = 0; row < 16; ++row)
-        {
-            addition += src_left[row * src_stride_left];
-        }
-
-        addition = (addition + 16) >> 5;
-        out = (v16u8)__msa_fill_b(addition);
-    }
-    else if (is_left)
-    {
-        for (row = 0; row < 16; ++row)
-        {
-            addition += src_left[row * src_stride_left];
-        }
-
-        addition = (addition + 8) >> 4;
-        out = (v16u8)__msa_fill_b(addition);
-    }
-    else if (is_above)
-    {
-        src_above = LD_UB(src_top);
-
-        sum_above = __msa_hadd_u_h(src_above, src_above);
-        sum_top = __msa_hadd_u_w(sum_above, sum_above);
-        sum = __msa_hadd_u_d(sum_top, sum_top);
-        sum_top = (v4u32)__msa_pckev_w((v4i32)sum, (v4i32)sum);
-        sum = __msa_hadd_u_d(sum_top, sum_top);
-        sum = (v2u64)__msa_srari_d((v2i64)sum, 4);
-        out = (v16u8)__msa_splati_b((v16i8)sum, 0);
-    }
-    else
-    {
-        out = (v16u8)__msa_ldi_b(128);
-    }
-
-    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-    dst += (8 * dst_stride);
-    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-void vp8_build_intra_predictors_mby_s_msa(struct macroblockd *x,
-                                          unsigned char *yabove_row,
-                                          unsigned char *yleft,
-                                          int left_stride,
-                                          unsigned char *ypred_ptr,
-                                          int y_stride)
-{
-    uint32_t row, col;
-    uint8_t ytop_left = yabove_row[-1];
-
-    switch (x->mode_info_context->mbmi.mode)
-    {
-        case DC_PRED:
-            intra_predict_dc_16x16_msa(yabove_row, yleft, left_stride,
-                                       ypred_ptr, y_stride,
-                                       x->up_available, x->left_available);
-            break;
-
-        case V_PRED:
-            intra_predict_vert_16x16_msa(yabove_row, ypred_ptr, y_stride);
-            break;
-
-        case H_PRED:
-            intra_predict_horiz_16x16_msa(yleft, left_stride, ypred_ptr,
-                                          y_stride);
-            break;
-
-        case TM_PRED:
-            for (row = 0; row < 16; ++row)
-            {
-                for (col = 0; col < 16; ++col)
-                {
-                    int pred = yleft[row * left_stride] + yabove_row[col] -
-                               ytop_left;
-
-                    if (pred < 0)
-                        pred = 0;
-
-                    if (pred > 255)
-                        pred = 255;
-
-                    ypred_ptr[col] = pred;
-                }
-
-                ypred_ptr += y_stride;
-            }
-            break;
-
-        case B_PRED:
-        case NEARESTMV:
-        case NEARMV:
-        case ZEROMV:
-        case NEWMV:
-        case SPLITMV:
-        case MB_MODE_COUNT:
-            break;
-    }
-}
-
-void vp8_build_intra_predictors_mbuv_s_msa(struct macroblockd *x,
-                                           unsigned char *uabove_row,
-                                           unsigned char *vabove_row,
-                                           unsigned char *uleft,
-                                           unsigned char *vleft,
-                                           int left_stride,
-                                           unsigned char *upred_ptr,
-                                           unsigned char *vpred_ptr,
-                                           int pred_stride)
-{
-    uint32_t row, col;
-    uint8_t utop_left = uabove_row[-1];
-    uint8_t vtop_left = vabove_row[-1];
-
-    switch (x->mode_info_context->mbmi.uv_mode)
-    {
-        case DC_PRED:
-            intra_predict_dc_8x8_msa(uabove_row, uleft, left_stride,
-                                     upred_ptr, pred_stride,
-                                     x->up_available, x->left_available);
-            intra_predict_dc_8x8_msa(vabove_row, vleft, left_stride,
-                                     vpred_ptr, pred_stride,
-                                     x->up_available, x->left_available);
-            break;
-
-        case V_PRED:
-            intra_predict_vert_8x8_msa(uabove_row, upred_ptr, pred_stride);
-            intra_predict_vert_8x8_msa(vabove_row, vpred_ptr, pred_stride);
-            break;
-
-        case H_PRED:
-            intra_predict_horiz_8x8_msa(uleft, left_stride, upred_ptr,
-                                        pred_stride);
-            intra_predict_horiz_8x8_msa(vleft, left_stride, vpred_ptr,
-                                        pred_stride);
-            break;
-
-        case TM_PRED:
-            for (row = 0; row < 8; ++row)
-            {
-                for (col = 0; col < 8; ++col)
-                {
-                    int predu = uleft[row * left_stride] + uabove_row[col] -
-                                utop_left;
-                    int predv = vleft[row * left_stride] + vabove_row[col] -
-                                vtop_left;
-
-                    if (predu < 0)
-                        predu = 0;
-
-                    if (predu > 255)
-                        predu = 255;
-
-                    if (predv < 0)
-                        predv = 0;
-
-                    if (predv > 255)
-                        predv = 255;
-
-                    upred_ptr[col] = predu;
-                    vpred_ptr[col] = predv;
-                }
-
-                upred_ptr += pred_stride;
-                vpred_ptr += pred_stride;
-            }
-            break;
-
-        case B_PRED:
-        case NEARESTMV:
-        case NEARMV:
-        case ZEROMV:
-        case NEWMV:
-        case SPLITMV:
-        case MB_MODE_COUNT:
-            break;
-    }
-}
diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h
index f39b675c..febe8150 100644
--- a/libvpx/vp8/common/onyx.h
+++ b/libvpx/vp8/common/onyx.h
@@ -65,7 +65,7 @@ extern "C"
 
 
 #include <assert.h>
-    static void Scale2Ratio(int mode, int *hr, int *hs)
+    static INLINE void Scale2Ratio(int mode, int *hr, int *hs)
     {
         switch (mode)
         {
diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c
index a4e6ae17..322b6138 100644
--- a/libvpx/vp8/common/postproc.c
+++ b/libvpx/vp8/common/postproc.c
@@ -675,6 +675,7 @@ void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
     }
 }
 
+#if CONFIG_POSTPROC_VISUALIZER
 static void constrain_line (int x_0, int *x_1, int y_0, int *y_1, int width, int height)
 {
     int dx;
@@ -717,6 +718,7 @@ static void constrain_line (int x_0, int *x_1, int y_0, int *y_1, int width, int
             *x_1 = ((0-y_0)*dx)/dy + x_0;
     }
 }
+#endif  // CONFIG_POSTPROC_VISUALIZER
 
 #if CONFIG_POSTPROC
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c
index 0a6c51b3..356655da 100644
--- a/libvpx/vp8/common/reconintra.c
+++ b/libvpx/vp8/common/reconintra.c
@@ -9,272 +9,109 @@
  */
 
 
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
 #include "blockd.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
 
-void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
-                                          unsigned char * yabove_row,
-                                          unsigned char * yleft,
-                                          int left_stride,
-                                          unsigned char * ypred_ptr,
-                                          int y_stride)
+enum {
+    SIZE_16,
+    SIZE_8,
+    NUM_SIZES,
+};
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[4][NUM_SIZES];
+static intra_pred_fn dc_pred[2][2][NUM_SIZES];
+
+static void vp8_init_intra_predictors_internal(void)
 {
-    unsigned char yleft_col[16];
-    unsigned char ytop_left = yabove_row[-1];
-    int r, c, i;
+#define INIT_SIZE(sz) \
+    pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \
+    pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \
+    pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \
+ \
+    dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \
+    dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \
+    dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \
+    dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz
+
+    INIT_SIZE(16);
+    INIT_SIZE(8);
+    vp8_init_intra4x4_predictors_internal();
+}
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+                                      unsigned char * yabove_row,
+                                      unsigned char * yleft,
+                                      int left_stride,
+                                      unsigned char * ypred_ptr,
+                                      int y_stride)
+{
+    MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode;
+    DECLARE_ALIGNED(16, uint8_t, yleft_col[16]);
+    int i;
+    intra_pred_fn fn;
 
     for (i = 0; i < 16; i++)
     {
         yleft_col[i] = yleft[i* left_stride];
     }
 
-    /* for Y */
-    switch (x->mode_info_context->mbmi.mode)
-    {
-    case DC_PRED:
-    {
-        int expected_dc;
-        int shift;
-        int average = 0;
-
-
-        if (x->up_available || x->left_available)
-        {
-            if (x->up_available)
-            {
-                for (i = 0; i < 16; i++)
-                {
-                    average += yabove_row[i];
-                }
-            }
-
-            if (x->left_available)
-            {
-
-                for (i = 0; i < 16; i++)
-                {
-                    average += yleft_col[i];
-                }
-
-            }
-
-
-
-            shift = 3 + x->up_available + x->left_available;
-            expected_dc = (average + (1 << (shift - 1))) >> shift;
-        }
-        else
-        {
-            expected_dc = 128;
-        }
-
-        /*memset(ypred_ptr, expected_dc, 256);*/
-        for (r = 0; r < 16; r++)
-        {
-            memset(ypred_ptr, expected_dc, 16);
-            ypred_ptr += y_stride;
-        }
-    }
-    break;
-    case V_PRED:
+    if (mode == DC_PRED)
     {
-
-        for (r = 0; r < 16; r++)
-        {
-
-            ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
-            ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
-            ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
-            ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
-            ypred_ptr += y_stride;
-        }
+        fn = dc_pred[x->left_available][x->up_available][SIZE_16];
     }
-    break;
-    case H_PRED:
+    else
     {
-
-        for (r = 0; r < 16; r++)
-        {
-
-            memset(ypred_ptr, yleft_col[r], 16);
-            ypred_ptr += y_stride;
-        }
-
+        fn = pred[mode][SIZE_16];
     }
-    break;
-    case TM_PRED:
-    {
-
-        for (r = 0; r < 16; r++)
-        {
-            for (c = 0; c < 16; c++)
-            {
-                int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
-
-                if (pred < 0)
-                    pred = 0;
 
-                if (pred > 255)
-                    pred = 255;
-
-                ypred_ptr[c] = pred;
-            }
-
-            ypred_ptr += y_stride;
-        }
-
-    }
-    break;
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-        break;
-    }
+    fn(ypred_ptr, y_stride, yabove_row, yleft_col);
 }
 
-void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
-                                         unsigned char * uabove_row,
-                                         unsigned char * vabove_row,
-                                         unsigned char * uleft,
-                                         unsigned char * vleft,
-                                         int left_stride,
-                                         unsigned char * upred_ptr,
-                                         unsigned char * vpred_ptr,
-                                         int pred_stride)
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+                                       unsigned char * uabove_row,
+                                       unsigned char * vabove_row,
+                                       unsigned char * uleft,
+                                       unsigned char * vleft,
+                                       int left_stride,
+                                       unsigned char * upred_ptr,
+                                       unsigned char * vpred_ptr,
+                                       int pred_stride)
 {
+    MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
     unsigned char uleft_col[8];
-    unsigned char utop_left = uabove_row[-1];
     unsigned char vleft_col[8];
-    unsigned char vtop_left = vabove_row[-1];
-
-    int i, j;
+    int i;
+    intra_pred_fn fn;
 
     for (i = 0; i < 8; i++)
     {
-        uleft_col[i] = uleft [i* left_stride];
-        vleft_col[i] = vleft [i* left_stride];
+        uleft_col[i] = uleft[i * left_stride];
+        vleft_col[i] = vleft[i * left_stride];
     }
 
-    switch (x->mode_info_context->mbmi.uv_mode)
-    {
-    case DC_PRED:
-    {
-        int expected_udc;
-        int expected_vdc;
-        int shift;
-        int Uaverage = 0;
-        int Vaverage = 0;
-
-        if (x->up_available)
-        {
-            for (i = 0; i < 8; i++)
-            {
-                Uaverage += uabove_row[i];
-                Vaverage += vabove_row[i];
-            }
-        }
-
-        if (x->left_available)
-        {
-            for (i = 0; i < 8; i++)
-            {
-                Uaverage += uleft_col[i];
-                Vaverage += vleft_col[i];
-            }
-        }
-
-        if (!x->up_available && !x->left_available)
-        {
-            expected_udc = 128;
-            expected_vdc = 128;
-        }
-        else
-        {
-            shift = 2 + x->up_available + x->left_available;
-            expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
-            expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
-        }
-
-
-        /*memset(upred_ptr,expected_udc,64);*/
-        /*memset(vpred_ptr,expected_vdc,64);*/
-        for (i = 0; i < 8; i++)
-        {
-            memset(upred_ptr, expected_udc, 8);
-            memset(vpred_ptr, expected_vdc, 8);
-            upred_ptr += pred_stride;
-            vpred_ptr += pred_stride;
-        }
-    }
-    break;
-    case V_PRED:
+    if (uvmode == DC_PRED)
     {
-        for (i = 0; i < 8; i++)
-        {
-            memcpy(upred_ptr, uabove_row, 8);
-            memcpy(vpred_ptr, vabove_row, 8);
-            upred_ptr += pred_stride;
-            vpred_ptr += pred_stride;
-        }
-
+        fn = dc_pred[x->left_available][x->up_available][SIZE_8];
     }
-    break;
-    case H_PRED:
+    else
     {
-        for (i = 0; i < 8; i++)
-        {
-            memset(upred_ptr, uleft_col[i], 8);
-            memset(vpred_ptr, vleft_col[i], 8);
-            upred_ptr += pred_stride;
-            vpred_ptr += pred_stride;
-        }
+        fn = pred[uvmode][SIZE_8];
     }
 
-    break;
-    case TM_PRED:
-    {
-        for (i = 0; i < 8; i++)
-        {
-            for (j = 0; j < 8; j++)
-            {
-                int predu = uleft_col[i] + uabove_row[j] - utop_left;
-                int predv = vleft_col[i] + vabove_row[j] - vtop_left;
-
-                if (predu < 0)
-                    predu = 0;
-
-                if (predu > 255)
-                    predu = 255;
-
-                if (predv < 0)
-                    predv = 0;
-
-                if (predv > 255)
-                    predv = 255;
-
-                upred_ptr[j] = predu;
-                vpred_ptr[j] = predv;
-            }
-
-            upred_ptr += pred_stride;
-            vpred_ptr += pred_stride;
-        }
+    fn(upred_ptr, pred_stride, uabove_row, uleft_col);
+    fn(vpred_ptr, pred_stride, vabove_row, vleft_col);
+}
 
-    }
-    break;
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-        break;
-    }
+void vp8_init_intra_predictors(void)
+{
+    once(vp8_init_intra_predictors_internal);
 }
diff --git a/libvpx/vp8/common/reconintra.h b/libvpx/vp8/common/reconintra.h
new file mode 100644
index 00000000..b6225a66
--- /dev/null
+++ b/libvpx/vp8/common/reconintra.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA_H_
+#define VP8_COMMON_RECONINTRA_H_
+
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+                                      unsigned char *yabove_row,
+                                      unsigned char *yleft,
+                                      int left_stride,
+                                      unsigned char *ypred_ptr,
+                                      int y_stride);
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+                                       unsigned char * uabove_row,
+                                       unsigned char * vabove_row,
+                                       unsigned char * uleft,
+                                       unsigned char * vleft,
+                                       int left_stride,
+                                       unsigned char * upred_ptr,
+                                       unsigned char * vpred_ptr,
+                                       int pred_stride);
+
+void vp8_init_intra_predictors(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTRA_H_
diff --git a/libvpx/vp8/common/reconintra4x4.c b/libvpx/vp8/common/reconintra4x4.c
index 3d4f2c40..35ad891e 100644
--- a/libvpx/vp8/common/reconintra4x4.c
+++ b/libvpx/vp8/common/reconintra4x4.c
@@ -8,290 +8,47 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <string.h>
 
 #include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "blockd.h"
 
-void vp8_intra4x4_predict_c(unsigned char *Above,
-                            unsigned char *yleft, int left_stride,
-                            int           _b_mode,
-                            unsigned char *dst, int dst_stride,
-                            unsigned char top_left)
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+/* Dispatch table filled by the init function below; entries are NULL before. */
+static intra_pred_fn pred[10];
+/* Bind each B_*_PRED mode to its vpx_*_predictor_4x4 implementation. */
+void vp8_init_intra4x4_predictors_internal(void)
+{
+    pred[B_DC_PRED] = vpx_dc_predictor_4x4;
+    pred[B_TM_PRED] = vpx_tm_predictor_4x4;
+    pred[B_VE_PRED] = vpx_ve_predictor_4x4;
+    pred[B_HE_PRED] = vpx_he_predictor_4x4;
+    pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
+    pred[B_RD_PRED] = vpx_d135_predictor_4x4;
+    pred[B_VR_PRED] = vpx_d117_predictor_4x4;
+    pred[B_VL_PRED] = vpx_d63f_predictor_4x4;
+    pred[B_HD_PRED] = vpx_d153_predictor_4x4;
+    pred[B_HU_PRED] = vpx_d207_predictor_4x4;
+}
+/* Repack the block's neighbours into local buffers, then dispatch on b_mode. */
+void vp8_intra4x4_predict(unsigned char *above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left)
 {
-    int i, r, c;
-    B_PREDICTION_MODE b_mode = (B_PREDICTION_MODE)_b_mode;
     unsigned char Left[4];
+    unsigned char Aboveb[12], *Above = Aboveb + 4;  /* Above[-4..7] */
+    /* Left neighbours are left_stride apart; collect them contiguously. */
     Left[0] = yleft[0];
     Left[1] = yleft[left_stride];
     Left[2] = yleft[2 * left_stride];
     Left[3] = yleft[3 * left_stride];
+    memcpy(Above, above, 8);  /* reads above[0..7] */
+    Above[-1] = top_left;  /* corner pixel sits just before the row */
 
-    switch (b_mode)
-    {
-    case B_DC_PRED:
-    {
-        int expected_dc = 0;
-
-        for (i = 0; i < 4; i++)
-        {
-            expected_dc += Above[i];
-            expected_dc += Left[i];
-        }
-
-        expected_dc = (expected_dc + 4) >> 3;
-
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-                dst[c] = expected_dc;
-            }
-
-            dst += dst_stride;
-        }
-    }
-    break;
-    case B_TM_PRED:
-    {
-        /* prediction similar to true_motion prediction */
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-                int pred = Above[c] - top_left + Left[r];
-
-                if (pred < 0)
-                    pred = 0;
-
-                if (pred > 255)
-                    pred = 255;
-
-                dst[c] = pred;
-            }
-
-            dst += dst_stride;
-        }
-    }
-    break;
-
-    case B_VE_PRED:
-    {
-
-        unsigned int ap[4];
-        ap[0] = (top_left  + 2 * Above[0] + Above[1] + 2) >> 2;
-        ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
-        ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
-        ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
-
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-
-                dst[c] = ap[c];
-            }
-
-            dst += dst_stride;
-        }
-
-    }
-    break;
-
-
-    case B_HE_PRED:
-    {
-
-        unsigned int lp[4];
-        lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
-        lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
-        lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
-        lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
-
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-                dst[c] = lp[r];
-            }
-
-            dst += dst_stride;
-        }
-    }
-    break;
-    case B_LD_PRED:
-    {
-        unsigned char *ptr = Above;
-        dst[0 * dst_stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
-        dst[0 * dst_stride + 1] =
-            dst[1 * dst_stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
-        dst[0 * dst_stride + 2] =
-            dst[1 * dst_stride + 1] =
-                dst[2 * dst_stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
-        dst[0 * dst_stride + 3] =
-            dst[1 * dst_stride + 2] =
-                dst[2 * dst_stride + 1] =
-                    dst[3 * dst_stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
-        dst[1 * dst_stride + 3] =
-            dst[2 * dst_stride + 2] =
-                dst[3 * dst_stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
-        dst[2 * dst_stride + 3] =
-            dst[3 * dst_stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
-        dst[3 * dst_stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
-    }
-    break;
-    case B_RD_PRED:
-    {
-
-        unsigned char pp[9];
-
-        pp[0] = Left[3];
-        pp[1] = Left[2];
-        pp[2] = Left[1];
-        pp[3] = Left[0];
-        pp[4] = top_left;
-        pp[5] = Above[0];
-        pp[6] = Above[1];
-        pp[7] = Above[2];
-        pp[8] = Above[3];
-
-        dst[3 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        dst[3 * dst_stride + 1] =
-            dst[2 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        dst[3 * dst_stride + 2] =
-            dst[2 * dst_stride + 1] =
-                dst[1 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        dst[3 * dst_stride + 3] =
-            dst[2 * dst_stride + 2] =
-                dst[1 * dst_stride + 1] =
-                    dst[0 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        dst[2 * dst_stride + 3] =
-            dst[1 * dst_stride + 2] =
-                dst[0 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        dst[1 * dst_stride + 3] =
-            dst[0 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-        dst[0 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
-    }
-    break;
-    case B_VR_PRED:
-    {
-
-        unsigned char pp[9];
-
-        pp[0] = Left[3];
-        pp[1] = Left[2];
-        pp[2] = Left[1];
-        pp[3] = Left[0];
-        pp[4] = top_left;
-        pp[5] = Above[0];
-        pp[6] = Above[1];
-        pp[7] = Above[2];
-        pp[8] = Above[3];
-
-
-        dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        dst[2 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        dst[3 * dst_stride + 1] =
-            dst[1 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        dst[2 * dst_stride + 1] =
-            dst[0 * dst_stride + 0] = (pp[4] + pp[5] + 1) >> 1;
-        dst[3 * dst_stride + 2] =
-            dst[1 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        dst[2 * dst_stride + 2] =
-            dst[0 * dst_stride + 1] = (pp[5] + pp[6] + 1) >> 1;
-        dst[3 * dst_stride + 3] =
-            dst[1 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-        dst[2 * dst_stride + 3] =
-            dst[0 * dst_stride + 2] = (pp[6] + pp[7] + 1) >> 1;
-        dst[1 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-        dst[0 * dst_stride + 3] = (pp[7] + pp[8] + 1) >> 1;
-
-    }
-    break;
-    case B_VL_PRED:
-    {
-
-        unsigned char *pp = Above;
-
-        dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
-        dst[1 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        dst[2 * dst_stride + 0] =
-            dst[0 * dst_stride + 1] = (pp[1] + pp[2] + 1) >> 1;
-        dst[1 * dst_stride + 1] =
-            dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        dst[2 * dst_stride + 1] =
-            dst[0 * dst_stride + 2] = (pp[2] + pp[3] + 1) >> 1;
-        dst[3 * dst_stride + 1] =
-            dst[1 * dst_stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        dst[0 * dst_stride + 3] =
-            dst[2 * dst_stride + 2] = (pp[3] + pp[4] + 1) >> 1;
-        dst[1 * dst_stride + 3] =
-            dst[3 * dst_stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        dst[2 * dst_stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        dst[3 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HD_PRED:
-    {
-        unsigned char pp[9];
-        pp[0] = Left[3];
-        pp[1] = Left[2];
-        pp[2] = Left[1];
-        pp[3] = Left[0];
-        pp[4] = top_left;
-        pp[5] = Above[0];
-        pp[6] = Above[1];
-        pp[7] = Above[2];
-        pp[8] = Above[3];
-
-
-        dst[3 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
-        dst[3 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        dst[2 * dst_stride + 0] =
-            dst[3 * dst_stride + 2] = (pp[1] + pp[2] + 1) >> 1;
-        dst[2 * dst_stride + 1] =
-            dst[3 * dst_stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        dst[2 * dst_stride + 2] =
-            dst[1 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1;
-        dst[2 * dst_stride + 3] =
-            dst[1 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        dst[1 * dst_stride + 2] =
-            dst[0 * dst_stride + 0] = (pp[3] + pp[4] + 1) >> 1;
-        dst[1 * dst_stride + 3] =
-            dst[0 * dst_stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        dst[0 * dst_stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        dst[0 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-
-    case B_HU_PRED:
-    {
-        unsigned char *pp = Left;
-        dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
-        dst[0 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        dst[0 * dst_stride + 2] =
-            dst[1 * dst_stride + 0] = (pp[1] + pp[2] + 1) >> 1;
-        dst[0 * dst_stride + 3] =
-            dst[1 * dst_stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        dst[1 * dst_stride + 2] =
-            dst[2 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1;
-        dst[1 * dst_stride + 3] =
-            dst[2 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
-        dst[2 * dst_stride + 2] =
-            dst[2 * dst_stride + 3] =
-                dst[3 * dst_stride + 0] =
-                    dst[3 * dst_stride + 1] =
-                        dst[3 * dst_stride + 2] =
-                            dst[3 * dst_stride + 3] = pp[3];
-    }
-    break;
-
-    default:
-    break;
-
-    }
+    pred[b_mode](dst, dst_stride, Above, Left);  /* no bounds/NULL check */
 }
diff --git a/libvpx/vp8/common/reconintra4x4.h b/libvpx/vp8/common/reconintra4x4.h
index ed59c9ed..869841ee 100644
--- a/libvpx/vp8/common/reconintra4x4.h
+++ b/libvpx/vp8/common/reconintra4x4.h
@@ -18,7 +18,7 @@ extern "C" {
 #endif
 
 static void intra_prediction_down_copy(MACROBLOCKD *xd,
-                                             unsigned char *above_right_src)
+                                       unsigned char *above_right_src)
 {
     int dst_stride = xd->dst.y_stride;
     unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
@@ -33,6 +33,14 @@ static void intra_prediction_down_copy(MACROBLOCKD *xd,
     *dst_ptr2 = *src_ptr;
 }
 
+void vp8_intra4x4_predict(unsigned char *Above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left);
+/* Fills the table vp8_intra4x4_predict() dispatches through; call it first. */
+void vp8_init_intra4x4_predictors_internal(void);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl
index 7924ae75..6799c278 100644
--- a/libvpx/vp8/common/rtcd_defs.pl
+++ b/libvpx/vp8/common/rtcd_defs.pl
@@ -152,16 +152,6 @@ specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/;
 $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
 $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
 
-add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon msa/;
-
-add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon msa/;
-
-add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
-specialize qw/vp8_intra4x4_predict media/;
-$vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6;
-
 #
 # Postproc
 #
diff --git a/libvpx/vp8/common/setupintrarecon.h b/libvpx/vp8/common/setupintrarecon.h
index 608f4a9a..1857c4e2 100644
--- a/libvpx/vp8/common/setupintrarecon.h
+++ b/libvpx/vp8/common/setupintrarecon.h
@@ -11,6 +11,7 @@
 #ifndef VP8_COMMON_SETUPINTRARECON_H_
 #define VP8_COMMON_SETUPINTRARECON_H_
 
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 
 #ifdef __cplusplus
@@ -19,12 +20,11 @@ extern "C" {
 extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
 extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
 
-static
-void setup_intra_recon_left(unsigned char *y_buffer,
-                            unsigned char *u_buffer,
-                            unsigned char *v_buffer,
-                            int y_stride,
-                            int uv_stride)
+static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
+                                          unsigned char *u_buffer,
+                                          unsigned char *v_buffer,
+                                          int y_stride,
+                                          int uv_stride)
 {
     int i;
 
diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm
index 7141f832..cb89537f 100644
--- a/libvpx/vp8/common/x86/recon_sse2.asm
+++ b/libvpx/vp8/common/x86/recon_sse2.asm
@@ -114,1002 +114,3 @@ sym(vp8_copy_mem16x16_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;void vp8_intra_pred_uv_dc_mmx2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
-sym(vp8_intra_pred_uv_dc_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from top
-    mov         rdi,        arg(2) ;above;
-    mov         rsi,        arg(3) ;left;
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-    pxor        mm0,        mm0
-    movq        mm1,        [rdi]
-    lea         rdi,        [rax*3]
-    psadbw      mm1,        mm0
-    ; from left
-    movzx       ecx,        byte [rsi]
-    movzx       edx,        byte [rsi+rax*1]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-
-    movzx       edx,        byte [rsi+rdi]
-    lea         rsi,        [rsi+rax*4]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-
-    ; add up
-    pextrw      edx,        mm1, 0x0
-    lea         edx,        [edx+ecx+8]
-    sar         edx,        4
-    movd        mm1,        edx
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    pshufw      mm1,        mm1, 0x0
-    mov         rdi,        arg(0) ;dst;
-    packuswb    mm1,        mm1
-
-    ; write out
-    lea         rax,        [rcx*3]
-    lea         rdx,        [rdi+rcx*4]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    movq [rdx      ],       mm1
-    movq [rdx+rcx  ],       mm1
-    movq [rdx+rcx*2],       mm1
-    movq [rdx+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_uv_dctop_mmx2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
-sym(vp8_intra_pred_uv_dctop_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;arg(3), arg(4) not used
-
-    ; from top
-    mov         rsi,        arg(2) ;above;
-    pxor        mm0,        mm0
-    movq        mm1,        [rsi]
-    psadbw      mm1,        mm0
-
-    ; add up
-    paddw       mm1,        [GLOBAL(dc_4)]
-    psraw       mm1,        3
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_uv_dcleft_mmx2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
-sym(vp8_intra_pred_uv_dcleft_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;arg(2) not used
-
-    ; from left
-    mov         rsi,        arg(3) ;left;
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-    lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi]
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    lea         edx,        [ecx+edx+4]
-
-    ; add up
-    shr         edx,        3
-    movd        mm1,        edx
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_uv_dc128_mmx(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
-sym(vp8_intra_pred_uv_dc128_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    ; end prolog
-
-    ;arg(2), arg(3), arg(4) not used
-
-    ; write out
-    movq        mm1,        [GLOBAL(dc_128)]
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-    lea         rax,        [rax+rdx*4]
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_uv_tm_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-%macro vp8_intra_pred_uv_tm 1
-global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
-sym(vp8_intra_pred_uv_tm_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ; read top row
-    mov         edx,        4
-    mov         rsi,        arg(2) ;above
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-    pxor        xmm0,       xmm0
-%ifidn %1, ssse3
-    movdqa      xmm2,       [GLOBAL(dc_1024)]
-%endif
-    movq        xmm1,       [rsi]
-    punpcklbw   xmm1,       xmm0
-
-    ; set up left ptrs ans subtract topleft
-    movd        xmm3,       [rsi-1]
-    mov         rsi,        arg(3) ;left;
-%ifidn %1, sse2
-    punpcklbw   xmm3,       xmm0
-    pshuflw     xmm3,       xmm3, 0x0
-    punpcklqdq  xmm3,       xmm3
-%else
-    pshufb      xmm3,       xmm2
-%endif
-    psubw       xmm1,       xmm3
-
-    ; set up dest ptrs
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-
-.vp8_intra_pred_uv_tm_%1_loop:
-    mov         bl,         [rsi]
-    movd        xmm3,       ebx
-
-    mov         bl,         [rsi+rax]
-    movd        xmm5,       ebx
-%ifidn %1, sse2
-    punpcklbw   xmm3,       xmm0
-    punpcklbw   xmm5,       xmm0
-    pshuflw     xmm3,       xmm3, 0x0
-    pshuflw     xmm5,       xmm5, 0x0
-    punpcklqdq  xmm3,       xmm3
-    punpcklqdq  xmm5,       xmm5
-%else
-    pshufb      xmm3,       xmm2
-    pshufb      xmm5,       xmm2
-%endif
-    paddw       xmm3,       xmm1
-    paddw       xmm5,       xmm1
-    packuswb    xmm3,       xmm5
-    movq  [rdi    ],        xmm3
-    movhps[rdi+rcx],        xmm3
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz .vp8_intra_pred_uv_tm_%1_loop
-
-    ; begin epilog
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp8_intra_pred_uv_tm sse2
-vp8_intra_pred_uv_tm ssse3
-
-;void vp8_intra_pred_uv_ve_mmx(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
-sym(vp8_intra_pred_uv_ve_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    ; end prolog
-
-    ; arg(3), arg(4) not used
-
-    ; read from top
-    mov         rax,        arg(2) ;src;
-
-    movq        mm1,        [rax]
-
-    ; write out
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-    lea         rax,        [rax+rdx*4]
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_uv_ho_mmx2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-%macro vp8_intra_pred_uv_ho 1
-global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
-sym(vp8_intra_pred_uv_ho_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    push        rbx
-%ifidn %1, ssse3
-    GET_GOT     rbx
-%endif
-    ; end prolog
-
-    ;arg(2) not used
-
-    ; read from left and write out
-%ifidn %1, mmx2
-    mov         edx,        4
-%endif
-    mov         rsi,        arg(3) ;left
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-%ifidn %1, ssse3
-    lea         rdx,        [rcx*3]
-    movdqa      xmm2,       [GLOBAL(dc_00001111)]
-%endif
-
-%ifidn %1, mmx2
-.vp8_intra_pred_uv_ho_%1_loop:
-    mov         bl,         [rsi]
-    movd        mm0,        ebx
-
-    mov         bl,         [rsi+rax]
-    movd        mm1,        ebx
-
-    punpcklbw   mm0,        mm0
-    punpcklbw   mm1,        mm1
-    pshufw      mm0,        mm0, 0x0
-    pshufw      mm1,        mm1, 0x0
-    movq  [rdi    ],        mm0
-    movq  [rdi+rcx],        mm1
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz .vp8_intra_pred_uv_ho_%1_loop
-%else
-    mov         bl,         [rsi]
-    movd        xmm0,       ebx
-
-    mov         bl,         [rsi+rax]
-    movd        xmm3,       ebx
-
-    mov         bl,         [rsi+rax*2]
-    movd        xmm1,       ebx
-
-    lea         rbx,        [rax*3]
-    mov         bl,         [rsi+rbx]
-    movd        xmm4,       ebx
-
-    punpcklbw   xmm0,       xmm3
-    punpcklbw   xmm1,       xmm4
-    pshufb      xmm0,       xmm2
-    pshufb      xmm1,       xmm2
-    movq   [rdi    ],       xmm0
-    movhps [rdi+rcx],       xmm0
-    movq [rdi+rcx*2],       xmm1
-    movhps [rdi+rdx],       xmm1
-    lea         rsi,        [rsi+rax*4]
-    lea         rdi,        [rdi+rcx*4]
-
-    mov         bl,         [rsi]
-    movd        xmm0,       ebx
-
-    mov         bl,         [rsi+rax]
-    movd        xmm3,       ebx
-
-    mov         bl,         [rsi+rax*2]
-    movd        xmm1,       ebx
-
-    lea         rbx,        [rax*3]
-    mov         bl,         [rsi+rbx]
-    movd        xmm4,       ebx
-
-    punpcklbw   xmm0,       xmm3
-    punpcklbw   xmm1,       xmm4
-    pshufb      xmm0,       xmm2
-    pshufb      xmm1,       xmm2
-    movq   [rdi    ],       xmm0
-    movhps [rdi+rcx],       xmm0
-    movq [rdi+rcx*2],       xmm1
-    movhps [rdi+rdx],       xmm1
-%endif
-
-    ; begin epilog
-%ifidn %1, ssse3
-    RESTORE_GOT
-%endif
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp8_intra_pred_uv_ho mmx2
-vp8_intra_pred_uv_ho ssse3
-
-;void vp8_intra_pred_y_dc_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
-sym(vp8_intra_pred_y_dc_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from top
-    mov         rdi,        arg(2) ;above
-    mov         rsi,        arg(3) ;left
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-
-    pxor        xmm0,       xmm0
-    movdqa      xmm1,       [rdi]
-    psadbw      xmm1,       xmm0
-    movq        xmm2,       xmm1
-    punpckhqdq  xmm1,       xmm1
-    paddw       xmm1,       xmm2
-
-    ; from left
-    lea         rdi,        [rax*3]
-
-    movzx       ecx,        byte [rsi]
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-
-    ; add up
-    pextrw      edx,        xmm1, 0x0
-    lea         edx,        [edx+ecx+16]
-    sar         edx,        5
-    movd        xmm1,       edx
-    ; FIXME use pshufb for ssse3 version
-    pshuflw     xmm1,       xmm1, 0x0
-    punpcklqdq  xmm1,       xmm1
-    packuswb    xmm1,       xmm1
-
-    ; write out
-    mov         rsi,        2
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-.label
-    movdqa [rdi      ],     xmm1
-    movdqa [rdi+rcx  ],     xmm1
-    movdqa [rdi+rcx*2],     xmm1
-    movdqa [rdi+rax  ],     xmm1
-    lea         rdi,        [rdi+rcx*4]
-    movdqa [rdi      ],     xmm1
-    movdqa [rdi+rcx  ],     xmm1
-    movdqa [rdi+rcx*2],     xmm1
-    movdqa [rdi+rax  ],     xmm1
-    lea         rdi,        [rdi+rcx*4]
-    dec         rsi
-    jnz .label
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_y_dctop_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
-sym(vp8_intra_pred_y_dctop_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    GET_GOT     rbx
-    ; end prolog
-
-    ;arg(3), arg(4) not used
-
-    ; from top
-    mov         rcx,        arg(2) ;above;
-    pxor        xmm0,       xmm0
-    movdqa      xmm1,       [rcx]
-    psadbw      xmm1,       xmm0
-    movdqa      xmm2,       xmm1
-    punpckhqdq  xmm1,       xmm1
-    paddw       xmm1,       xmm2
-
-    ; add up
-    paddw       xmm1,       [GLOBAL(dc_8)]
-    psraw       xmm1,       4
-    ; FIXME use pshufb for ssse3 version
-    pshuflw     xmm1,       xmm1, 0x0
-    punpcklqdq  xmm1,       xmm1
-    packuswb    xmm1,       xmm1
-
-    ; write out
-    mov         rsi,        2
-    mov         rdx,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-.label
-    movdqa [rdx      ],     xmm1
-    movdqa [rdx+rcx  ],     xmm1
-    movdqa [rdx+rcx*2],     xmm1
-    movdqa [rdx+rax  ],     xmm1
-    lea         rdx,        [rdx+rcx*4]
-    movdqa [rdx      ],     xmm1
-    movdqa [rdx+rcx  ],     xmm1
-    movdqa [rdx+rcx*2],     xmm1
-    movdqa [rdx+rax  ],     xmm1
-    lea         rdx,        [rdx+rcx*4]
-    dec         rsi
-    jnz .label
-
-    ; begin epilog
-    RESTORE_GOT
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_y_dcleft_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
-sym(vp8_intra_pred_y_dcleft_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;arg(2) not used
-
-    ; from left
-    mov         rsi,        arg(3) ;left;
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-
-    lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi]
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    lea         edx,        [ecx+edx+8]
-
-    ; add up
-    shr         edx,        4
-    movd        xmm1,       edx
-    ; FIXME use pshufb for ssse3 version
-    pshuflw     xmm1,       xmm1, 0x0
-    punpcklqdq  xmm1,       xmm1
-    packuswb    xmm1,       xmm1
-
-    ; write out
-    mov         rsi,        2
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-.label
-    movdqa [rdi      ],     xmm1
-    movdqa [rdi+rcx  ],     xmm1
-    movdqa [rdi+rcx*2],     xmm1
-    movdqa [rdi+rax  ],     xmm1
-    lea         rdi,        [rdi+rcx*4]
-    movdqa [rdi      ],     xmm1
-    movdqa [rdi+rcx  ],     xmm1
-    movdqa [rdi+rcx*2],     xmm1
-    movdqa [rdi+rax  ],     xmm1
-    lea         rdi,        [rdi+rcx*4]
-    dec         rsi
-    jnz .label
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_y_dc128_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
-sym(vp8_intra_pred_y_dc128_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    GET_GOT     rbx
-    ; end prolog
-
-    ;arg(2), arg(3), arg(4) not used
-
-    ; write out
-    mov         rsi,        2
-    movdqa      xmm1,       [GLOBAL(dc_128)]
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-.label
-    movdqa [rax      ],     xmm1
-    movdqa [rax+rdx  ],     xmm1
-    movdqa [rax+rdx*2],     xmm1
-    movdqa [rax+rcx  ],     xmm1
-    lea         rax,        [rax+rdx*4]
-    movdqa [rax      ],     xmm1
-    movdqa [rax+rdx  ],     xmm1
-    movdqa [rax+rdx*2],     xmm1
-    movdqa [rax+rcx  ],     xmm1
-    lea         rax,        [rax+rdx*4]
-    dec         rsi
-    jnz .label
-
-    ; begin epilog
-    RESTORE_GOT
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_y_tm_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-%macro vp8_intra_pred_y_tm 1
-global sym(vp8_intra_pred_y_tm_%1) PRIVATE
-sym(vp8_intra_pred_y_tm_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    GET_GOT     rbx
-    ; end prolog
-
-    ; read top row
-    mov         edx,        8
-    mov         rsi,        arg(2) ;above
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-    pxor        xmm0,       xmm0
-%ifidn %1, ssse3
-    movdqa      xmm3,       [GLOBAL(dc_1024)]
-%endif
-    movdqa      xmm1,       [rsi]
-    movdqa      xmm2,       xmm1
-    punpcklbw   xmm1,       xmm0
-    punpckhbw   xmm2,       xmm0
-
-    ; set up left ptrs ans subtract topleft
-    movd        xmm4,       [rsi-1]
-    mov         rsi,        arg(3) ;left
-%ifidn %1, sse2
-    punpcklbw   xmm4,       xmm0
-    pshuflw     xmm4,       xmm4, 0x0
-    punpcklqdq  xmm4,       xmm4
-%else
-    pshufb      xmm4,       xmm3
-%endif
-    psubw       xmm1,       xmm4
-    psubw       xmm2,       xmm4
-
-    ; set up dest ptrs
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-vp8_intra_pred_y_tm_%1_loop:
-    mov         bl,         [rsi]
-    movd        xmm4,       ebx
-
-    mov         bl,         [rsi+rax]
-    movd        xmm5,       ebx
-%ifidn %1, sse2
-    punpcklbw   xmm4,       xmm0
-    punpcklbw   xmm5,       xmm0
-    pshuflw     xmm4,       xmm4, 0x0
-    pshuflw     xmm5,       xmm5, 0x0
-    punpcklqdq  xmm4,       xmm4
-    punpcklqdq  xmm5,       xmm5
-%else
-    pshufb      xmm4,       xmm3
-    pshufb      xmm5,       xmm3
-%endif
-    movdqa      xmm6,       xmm4
-    movdqa      xmm7,       xmm5
-    paddw       xmm4,       xmm1
-    paddw       xmm6,       xmm2
-    paddw       xmm5,       xmm1
-    paddw       xmm7,       xmm2
-    packuswb    xmm4,       xmm6
-    packuswb    xmm5,       xmm7
-    movdqa [rdi    ],       xmm4
-    movdqa [rdi+rcx],       xmm5
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz vp8_intra_pred_y_tm_%1_loop
-
-    ; begin epilog
-    RESTORE_GOT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp8_intra_pred_y_tm sse2
-vp8_intra_pred_y_tm ssse3
-
-;void vp8_intra_pred_y_ve_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride
-;    )
-global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
-sym(vp8_intra_pred_y_ve_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    ; end prolog
-
-    ;arg(3), arg(4) not used
-
-    mov         rax,        arg(2) ;above;
-    mov         rsi,        2
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-
-    ; read from top
-    movdqa      xmm1,       [rax]
-
-    ; write out
-    mov         rax,        arg(0) ;dst;
-    lea         rcx,        [rdx*3]
-
-.label
-    movdqa [rax      ],     xmm1
-    movdqa [rax+rdx  ],     xmm1
-    movdqa [rax+rdx*2],     xmm1
-    movdqa [rax+rcx  ],     xmm1
-    lea         rax,        [rax+rdx*4]
-    movdqa [rax      ],     xmm1
-    movdqa [rax+rdx  ],     xmm1
-    movdqa [rax+rdx*2],     xmm1
-    movdqa [rax+rcx  ],     xmm1
-    lea         rax,        [rax+rdx*4]
-    dec         rsi
-    jnz .label
-
-    ; begin epilog
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_intra_pred_y_ho_sse2(
-;    unsigned char *dst,
-;    int dst_stride
-;    unsigned char *above,
-;    unsigned char *left,
-;    int left_stride,
-;    )
-global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
-sym(vp8_intra_pred_y_ho_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ;arg(2) not used
-
-    ; read from left and write out
-    mov         edx,        8
-    mov         rsi,        arg(3) ;left;
-    movsxd      rax,        dword ptr arg(4) ;left_stride;
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-
-vp8_intra_pred_y_ho_sse2_loop:
-    mov         bl,         [rsi]
-    movd        xmm0,       ebx
-    mov         bl,         [rsi+rax]
-    movd        xmm1,       ebx
-
-    ; FIXME use pshufb for ssse3 version
-    punpcklbw   xmm0,       xmm0
-    punpcklbw   xmm1,       xmm1
-    pshuflw     xmm0,       xmm0, 0x0
-    pshuflw     xmm1,       xmm1, 0x0
-    punpcklqdq  xmm0,       xmm0
-    punpcklqdq  xmm1,       xmm1
-    movdqa [rdi    ],       xmm0
-    movdqa [rdi+rcx],       xmm1
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz vp8_intra_pred_y_ho_sse2_loop
-
-    ; begin epilog
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-dc_128:
-    times 16 db 128
-dc_4:
-    times 4 dw 4
-align 16
-dc_8:
-    times 8 dw 8
-align 16
-dc_1024:
-    times 8 dw 0x400
-align 16
-dc_00001111:
-    times 8 db 0
-    times 8 db 1
diff --git a/libvpx/vp8/common/x86/recon_wrapper_sse2.c b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
deleted file mode 100644
index 65f4251a..00000000
--- a/libvpx/vp8/common/x86/recon_wrapper_sse2.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/blockd.h"
-
-#define build_intra_predictors_mbuv_prototype(sym) \
-    void sym(unsigned char *dst, int dst_stride, \
-             const unsigned char *above, \
-             const unsigned char *left, int left_stride)
-typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
-
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
-
-static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
-                                                unsigned char * uabove_row,
-                                                unsigned char * vabove_row,
-                                                unsigned char *dst_u,
-                                                unsigned char *dst_v,
-                                                int dst_stride,
-                                                unsigned char * uleft,
-                                                unsigned char * vleft,
-                                                int left_stride,
-                                                build_intra_predictors_mbuv_fn_t tm_func,
-                                                build_intra_predictors_mbuv_fn_t ho_func)
-{
-    int mode = x->mode_info_context->mbmi.uv_mode;
-    build_intra_predictors_mbuv_fn_t fn;
-
-    switch (mode) {
-        case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
-        case  H_PRED: fn = ho_func; break;
-        case TM_PRED: fn = tm_func; break;
-        case DC_PRED:
-            if (x->up_available) {
-                if (x->left_available) {
-                    fn = vp8_intra_pred_uv_dc_mmx2; break;
-                } else {
-                    fn = vp8_intra_pred_uv_dctop_mmx2; break;
-                }
-            } else if (x->left_available) {
-                fn = vp8_intra_pred_uv_dcleft_mmx2; break;
-            } else {
-                fn = vp8_intra_pred_uv_dc128_mmx; break;
-            }
-            break;
-        default: return;
-    }
-
-    fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
-    fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
-}
-
-void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
-                                            unsigned char * uabove_row,
-                                            unsigned char * vabove_row,
-                                            unsigned char * uleft,
-                                            unsigned char * vleft,
-                                            int left_stride,
-                                            unsigned char * upred_ptr,
-                                            unsigned char * vpred_ptr,
-                                            int pred_stride)
-{
-    vp8_build_intra_predictors_mbuv_x86(x,
-                                        uabove_row, vabove_row,
-                                        upred_ptr,
-                                        vpred_ptr, pred_stride,
-                                        uleft,
-                                        vleft,
-                                        left_stride,
-                                        vp8_intra_pred_uv_tm_sse2,
-                                        vp8_intra_pred_uv_ho_mmx2);
-}
-
-void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
-                                             unsigned char * uabove_row,
-                                             unsigned char * vabove_row,
-                                             unsigned char * uleft,
-                                             unsigned char * vleft,
-                                             int left_stride,
-                                             unsigned char * upred_ptr,
-                                             unsigned char * vpred_ptr,
-                                             int pred_stride)
-{
-    vp8_build_intra_predictors_mbuv_x86(x,
-                                        uabove_row, vabove_row,
-                                        upred_ptr,
-                                        vpred_ptr, pred_stride,
-                                        uleft,
-                                        vleft,
-                                        left_stride,
-                                        vp8_intra_pred_uv_tm_ssse3,
-                                        vp8_intra_pred_uv_ho_ssse3);
-}
-
-#define build_intra_predictors_mby_prototype(sym) \
-    void sym(unsigned char *dst, int dst_stride, \
-             const unsigned char *above, \
-             const unsigned char *left, int left_stride)
-typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t));
-
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2);
-extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3);
-
-static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
-                                               unsigned char * yabove_row,
-                                               unsigned char *dst_y,
-                                               int dst_stride,
-                                               unsigned char * yleft,
-                                               int left_stride,
-                                               build_intra_predictors_mby_fn_t tm_func)
-{
-    int mode = x->mode_info_context->mbmi.mode;
-    build_intra_predictors_mbuv_fn_t fn;
-
-    switch (mode) {
-        case  V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
-        case  H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
-        case TM_PRED: fn = tm_func; break;
-        case DC_PRED:
-            if (x->up_available) {
-                if (x->left_available) {
-                    fn = vp8_intra_pred_y_dc_sse2; break;
-                } else {
-                    fn = vp8_intra_pred_y_dctop_sse2; break;
-                }
-            } else if (x->left_available) {
-                fn = vp8_intra_pred_y_dcleft_sse2; break;
-            } else {
-                fn = vp8_intra_pred_y_dc128_sse2; break;
-            }
-            break;
-        default: return;
-    }
-
-    fn(dst_y, dst_stride, yabove_row, yleft, left_stride);
-    return;
-}
-
-void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x,
-                                           unsigned char * yabove_row,
-                                           unsigned char * yleft,
-                                           int left_stride,
-                                           unsigned char * ypred_ptr,
-                                           int y_stride)
-{
-    vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
-                                       y_stride, yleft, left_stride,
-                                       vp8_intra_pred_y_tm_sse2);
-}
-
-void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x,
-                                            unsigned char * yabove_row,
-                                            unsigned char * yleft,
-                                            int left_stride,
-                                            unsigned char * ypred_ptr,
-                                            int y_stride)
-{
-    vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
-                                     y_stride, yleft, left_stride,
-                                       vp8_intra_pred_y_tm_ssse3);
-
-}
diff --git a/libvpx/vp8/decoder/dboolhuff.c b/libvpx/vp8/decoder/dboolhuff.c
index b874d4c4..8a7e3320 100644
--- a/libvpx/vp8/decoder/dboolhuff.c
+++ b/libvpx/vp8/decoder/dboolhuff.c
@@ -11,6 +11,7 @@
 
 #include "dboolhuff.h"
 #include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 int vp8dx_start_decode(BOOL_DECODER *br,
                        const unsigned char *source,
@@ -48,7 +49,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
     unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
 
     if (br->decrypt_cb) {
-        size_t n = MIN(sizeof(decrypted), bytes_left);
+        size_t n = VPXMIN(sizeof(decrypted), bytes_left);
         br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
         bufptr = decrypted;
     }
diff --git a/libvpx/vp8/decoder/dboolhuff.h b/libvpx/vp8/decoder/dboolhuff.h
index 51c5adc2..cc9eaaf4 100644
--- a/libvpx/vp8/decoder/dboolhuff.h
+++ b/libvpx/vp8/decoder/dboolhuff.h
@@ -15,7 +15,7 @@
 #include <stddef.h>
 #include <limits.h>
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
@@ -95,7 +95,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
     return bit;
 }
 
-static int vp8_decode_value(BOOL_DECODER *br, int bits)
+static INLINE int vp8_decode_value(BOOL_DECODER *br, int bits)
 {
     int z = 0;
     int bit;
@@ -108,7 +108,7 @@ static int vp8_decode_value(BOOL_DECODER *br, int bits)
     return z;
 }
 
-static int vp8dx_bool_error(BOOL_DECODER *br)
+static INLINE int vp8dx_bool_error(BOOL_DECODER *br)
 {
     /* Check if we have reached the end of the buffer.
      *
diff --git a/libvpx/vp8/decoder/decodeframe.c b/libvpx/vp8/decoder/decodeframe.c
index 56e167db..f0d76037 100644
--- a/libvpx/vp8/decoder/decodeframe.c
+++ b/libvpx/vp8/decoder/decodeframe.c
@@ -23,6 +23,7 @@
 #include "vp8/common/entropymode.h"
 #include "vp8/common/quant_common.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vp8/common/reconintra.h"
 #include "vp8/common/setupintrarecon.h"
 
 #include "decodemv.h"
@@ -34,6 +35,7 @@
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include "dboolhuff.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -1021,7 +1023,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         const unsigned char *clear = data;
         if (pbi->decrypt_cb)
         {
-            int n = (int)MIN(sizeof(clear_buffer), data_end - data);
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_end - data);
             pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
             clear = clear_buffer;
         }
diff --git a/libvpx/vp8/decoder/error_concealment.c b/libvpx/vp8/decoder/error_concealment.c
index bb6d443c..0b846a08 100644
--- a/libvpx/vp8/decoder/error_concealment.c
+++ b/libvpx/vp8/decoder/error_concealment.c
@@ -16,6 +16,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 #define FLOOR(x,q) ((x) & -(1 << (q)))
 
@@ -93,13 +94,13 @@ static void assign_overlap(OVERLAP_NODE* overlaps,
  */
 static int block_overlap(int b1_row, int b1_col, int b2_row, int b2_col)
 {
-    const int int_top = MAX(b1_row, b2_row); // top
-    const int int_left = MAX(b1_col, b2_col); // left
+    const int int_top = VPXMAX(b1_row, b2_row); // top
+    const int int_left = VPXMAX(b1_col, b2_col); // left
     /* Since each block is 4x4 pixels, adding 4 (Q3) to the left/top edge
      * gives us the right/bottom edge.
      */
-    const int int_right = MIN(b1_col + (4<<3), b2_col + (4<<3)); // right
-    const int int_bottom = MIN(b1_row + (4<<3), b2_row + (4<<3)); // bottom
+    const int int_right = VPXMIN(b1_col + (4<<3), b2_col + (4<<3)); // right
+    const int int_bottom = VPXMIN(b1_row + (4<<3), b2_row + (4<<3)); // bottom
     return (int_bottom - int_top) * (int_right - int_left);
 }
 
@@ -124,7 +125,7 @@ static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi,
     /* If the block partly overlaps any previous MB, these coordinates
      * can be < 0. We don't want to access blocks in previous MBs.
      */
-    const int blk_idx = MAX(rel_ol_blk_row,0) * 4 + MAX(rel_ol_blk_col,0);
+    const int blk_idx = VPXMAX(rel_ol_blk_row,0) * 4 + VPXMAX(rel_ol_blk_col,0);
     /* Upper left overlapping block */
     B_OVERLAP *b_ol_ul = &(b_overlaps[blk_idx]);
 
@@ -132,8 +133,8 @@ static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi,
      * which the motion compensated block overlaps
      */
     /* Avoid calculating overlaps for blocks in later MBs */
-    int end_row = MIN(4 + mb_row * 4 - first_blk_row, 2);
-    int end_col = MIN(4 + mb_col * 4 - first_blk_col, 2);
+    int end_row = VPXMIN(4 + mb_row * 4 - first_blk_row, 2);
+    int end_col = VPXMIN(4 + mb_col * 4 - first_blk_col, 2);
     int row, col;
 
     /* Check if new_row and new_col are evenly divisible by 4 (Q3),
@@ -208,8 +209,8 @@ void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul,
     overlap_mb_row = FLOOR((overlap_b_row << 3) / 4, 3) >> 3;
     overlap_mb_col = FLOOR((overlap_b_col << 3) / 4, 3) >> 3;
 
-    end_row = MIN(mb_rows - overlap_mb_row, 2);
-    end_col = MIN(mb_cols - overlap_mb_col, 2);
+    end_row = VPXMIN(mb_rows - overlap_mb_row, 2);
+    end_col = VPXMIN(mb_cols - overlap_mb_col, 2);
 
     /* Don't calculate overlap for MBs we don't overlap */
     /* Check if the new block row starts at the last block row of the MB */
diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c
index 9015fcbb..3468268a 100644
--- a/libvpx/vp8/decoder/onyxd_if.c
+++ b/libvpx/vp8/decoder/onyxd_if.c
@@ -25,9 +25,12 @@
 #include <assert.h>
 
 #include "vp8/common/quant_common.h"
+#include "vp8/common/reconintra.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/systemdependent.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "detokenize.h"
 #if CONFIG_ERROR_CONCEALMENT
@@ -42,6 +45,17 @@ extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 static int get_free_fb (VP8_COMMON *cm);
 static void ref_cnt_fb (int *buf, int *idx, int new_idx);
 
+static void initialize_dec(void) {
+    static volatile int init_done = 0;
+
+    if (!init_done)
+    {
+        vpx_dsp_rtcd();
+        vp8_init_intra_predictors();
+        init_done = 1;
+    }
+}
+
 static void remove_decompressor(VP8D_COMP *pbi)
 {
 #if CONFIG_ERROR_CONCEALMENT
@@ -105,6 +119,8 @@ static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
 
     vp8_setup_block_dptrs(&pbi->mb);
 
+    once(initialize_dec);
+
     return pbi;
 }
 
diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c
index 6801532f..7c7184c7 100644
--- a/libvpx/vp8/decoder/threading.c
+++ b/libvpx/vp8/decoder/threading.c
@@ -24,6 +24,7 @@
 #include "detokenize.h"
 #include "vp8/common/reconintra4x4.h"
 #include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
 #include "vp8/common/setupintrarecon.h"
 #if CONFIG_ERROR_CONCEALMENT
 #include "error_concealment.h"
diff --git a/libvpx/vp8/decoder/treereader.h b/libvpx/vp8/decoder/treereader.h
index 35ee6960..f7d23c36 100644
--- a/libvpx/vp8/decoder/treereader.h
+++ b/libvpx/vp8/decoder/treereader.h
@@ -12,6 +12,7 @@
 #ifndef VP8_DECODER_TREEREADER_H_
 #define VP8_DECODER_TREEREADER_H_
 
+#include "./vpx_config.h"
 #include "vp8/common/treecoder.h"
 #include "dboolhuff.h"
 
@@ -28,7 +29,7 @@ typedef BOOL_DECODER vp8_reader;
 
 /* Intent of tree data structure is to make decoding trivial. */
 
-static int vp8_treed_read(
+static INLINE int vp8_treed_read(
     vp8_reader *const r,        /* !!! must return a 0 or 1 !!! */
     vp8_tree t,
     const vp8_prob *const p
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
index ea279b32..f3d91b55 100644
--- a/libvpx/vp8/encoder/bitstream.c
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -407,6 +407,7 @@ static void pack_tokens_into_partitions(VP8_COMP *cpi, unsigned char *cx_data,
 }
 
 
+#if CONFIG_MULTITHREAD
 static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w)
 {
     int mb_row;
@@ -421,6 +422,7 @@ static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w)
     }
 
 }
+#endif  // CONFIG_MULTITHREAD
 
 static void write_mv_ref
 (
@@ -1675,7 +1677,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
         if (cpi->b_multi_threaded)
             pack_mb_row_tokens(cpi, &cpi->bc[1]);
         else
-#endif
+#endif  // CONFIG_MULTITHREAD
             vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
 
         vp8_stop_encode(&cpi->bc[1]);
diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c
index d381d8dd..b0aaa2f0 100644
--- a/libvpx/vp8/encoder/encodeframe.c
+++ b/libvpx/vp8/encoder/encodeframe.c
@@ -700,6 +700,7 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)
     vp8_zero(x->count_mb_ref_frame_usage);
 }
 
+#if CONFIG_MULTITHREAD
 static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
 {
     int i = 0;
@@ -729,6 +730,7 @@ static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
     }
     while (++i < BLOCK_TYPES);
 }
+#endif  // CONFIG_MULTITHREAD
 
 void vp8_encode_frame(VP8_COMP *cpi)
 {
@@ -927,7 +929,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
 
         }
         else
-#endif
+#endif  // CONFIG_MULTITHREAD
         {
 
             /* for each macroblock row in image */
diff --git a/libvpx/vp8/encoder/encodeintra.c b/libvpx/vp8/encoder/encodeintra.c
index 938cc7ec..44be959c 100644
--- a/libvpx/vp8/encoder/encodeintra.c
+++ b/libvpx/vp8/encoder/encodeintra.c
@@ -13,6 +13,7 @@
 #include "vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp8/encoder/quantize.h"
+#include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
 #include "vp8/common/invtrans.h"
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
index f848e8fb..768c764c 100644
--- a/libvpx/vp8/encoder/mcomp.c
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -20,6 +20,7 @@
 #include <math.h>
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 #ifdef VP8_ENTROPY_STATS
 static int mv_ref_ct [31] [4] [2];
@@ -223,14 +224,14 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     unsigned int quarteriters = 4;
     int thismse;
 
-    int minc = MAX(x->mv_col_min * 4,
-                   (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
-    int maxc = MIN(x->mv_col_max * 4,
-                   (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
-    int minr = MAX(x->mv_row_min * 4,
-                   (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
-    int maxr = MIN(x->mv_row_max * 4,
-                   (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
+    int minc = VPXMAX(x->mv_col_min * 4,
+                      (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
+    int maxc = VPXMIN(x->mv_col_max * 4,
+                      (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
+    int minr = VPXMAX(x->mv_row_min * 4,
+                      (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
+    int maxr = VPXMIN(x->mv_row_max * 4,
+                      (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
 
     int y_stride;
     int offset;
diff --git a/libvpx/vp8/encoder/mr_dissim.c b/libvpx/vp8/encoder/mr_dissim.c
index 8d96445f..886cba2f 100644
--- a/libvpx/vp8/encoder/mr_dissim.c
+++ b/libvpx/vp8/encoder/mr_dissim.c
@@ -13,6 +13,7 @@
 #include "vpx_config.h"
 #include "onyx_int.h"
 #include "mr_dissim.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"
 #include "vp8/common/common.h"
@@ -192,11 +193,13 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi)
                                 }
                             }
 
-                            mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row),
-                                       abs(max_mvx - here->mbmi.mv.as_mv.row));
-                            mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col),
-                                       abs(max_mvy - here->mbmi.mv.as_mv.col));
-                            dissim = MAX(mmvx, mmvy);
+                            mmvx = VPXMAX(
+                                abs(min_mvx - here->mbmi.mv.as_mv.row),
+                                abs(max_mvx - here->mbmi.mv.as_mv.row));
+                            mmvy = VPXMAX(
+                                abs(min_mvy - here->mbmi.mv.as_mv.col),
+                                abs(max_mvy - here->mbmi.mv.as_mv.col));
+                            dissim = VPXMAX(mmvx, mmvy);
                         }
                     }
 
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
index 5e05c8c6..df5bcf68 100644
--- a/libvpx/vp8/encoder/onyx_if.c
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -31,6 +31,7 @@
 #include "vp8/common/postproc.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
+#include "vp8/common/reconintra.h"
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
 #include "vpx_ports/vpx_timer.h"
@@ -422,6 +423,16 @@ static void setup_features(VP8_COMP *cpi)
 
 static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
 
+void vp8_initialize_enc(void)
+{
+    static volatile int init_done = 0;
+
+    if (!init_done) {
+        vpx_dsp_rtcd();
+        vp8_init_intra_predictors();
+        init_done = 1;
+    }
+}
 
 static void dealloc_compressor_data(VP8_COMP *cpi)
 {
@@ -516,41 +527,6 @@ static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned
 }
 
 
-static void segmentation_test_function(VP8_COMP *cpi)
-{
-    unsigned char *seg_map;
-    signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
-
-    // Create a temporary map for segmentation data.
-    CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
-
-    // Set the segmentation Map
-    set_segmentation_map(cpi, seg_map);
-
-    // Activate segmentation.
-    enable_segmentation(cpi);
-
-    // Set up the quant segment data
-    feature_data[MB_LVL_ALT_Q][0] = 0;
-    feature_data[MB_LVL_ALT_Q][1] = 4;
-    feature_data[MB_LVL_ALT_Q][2] = 0;
-    feature_data[MB_LVL_ALT_Q][3] = 0;
-    // Set up the loop segment data
-    feature_data[MB_LVL_ALT_LF][0] = 0;
-    feature_data[MB_LVL_ALT_LF][1] = 0;
-    feature_data[MB_LVL_ALT_LF][2] = 0;
-    feature_data[MB_LVL_ALT_LF][3] = 0;
-
-    // Initialise the feature data structure
-    // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
-    set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
-
-    // Delete sementation map
-    vpx_free(seg_map);
-
-    seg_map = 0;
-}
-
 /* A simple function to cyclically refresh the background at a lower Q */
 static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
 {
@@ -913,7 +889,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     Speed = cpi->Speed;
     switch (Mode)
     {
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
     case 0: /* best quality mode */
         sf->first_step = 0;
         sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -1953,7 +1929,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
      * Currently this is tied to error resilliant mode
      */
     cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
-    cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5;
+    cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 7;
     if (cpi->oxcf.number_of_layers == 1) {
         cpi->cyclic_refresh_mode_max_mbs_perframe =
             (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
@@ -2065,7 +2041,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
 
     cpi->output_pkt_list = oxcf->output_pkt_list;
 
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 
     if (cpi->pass == 1)
     {
@@ -2227,7 +2203,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
 
     if (cpi && (cpi->common.current_video_frame > 0))
     {
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 
         if (cpi->pass == 2)
         {
@@ -3018,6 +2994,7 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
 }
 
 
+#if !CONFIG_REALTIME_ONLY
 /* 1 = key, 0 = inter */
 static int decide_key_frame(VP8_COMP *cpi)
 {
@@ -3085,7 +3062,6 @@ static int decide_key_frame(VP8_COMP *cpi)
 
 }
 
-#if !(CONFIG_REALTIME_ONLY)
 static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
 {
     (void) size;
@@ -3131,6 +3107,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
 #endif
 /* return of 0 means drop frame */
 
+#if !CONFIG_REALTIME_ONLY
 /* Function to test for conditions that indeicate we should loop
  * back and recode a frame.
  */
@@ -3180,6 +3157,7 @@ static int recode_loop_test( VP8_COMP *cpi,
 
     return force_recode;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void update_reference_frames(VP8_COMP *cpi)
 {
@@ -3601,7 +3579,7 @@ static void encode_frame_to_data_rate
     VP8_COMMON *cm = &cpi->common;
     int active_worst_qchanged = 0;
 
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
     int q_low;
     int q_high;
     int zbin_oq_high;
@@ -3640,7 +3618,7 @@ static void encode_frame_to_data_rate
     /* For an alt ref frame in 2 pass we skip the call to the second pass
      * function that sets the target bandwidth
      */
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 
     if (cpi->pass == 2)
     {
@@ -4149,7 +4127,7 @@ static void encode_frame_to_data_rate
     /* Determine initial Q to try */
     Q = vp8_regulate_q(cpi, cpi->this_frame_target);
 
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 
     /* Set highest allowed value for Zbin over quant */
     if (cm->frame_type == KEY_FRAME)
@@ -4179,7 +4157,7 @@ static void encode_frame_to_data_rate
 
     vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
 
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
     /* Limit Q range for the adaptive loop. */
     bottom_index = cpi->active_best_quality;
     top_index    = cpi->active_worst_quality;
@@ -4410,7 +4388,7 @@ static void encode_frame_to_data_rate
         if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME
             && cpi->compressor_speed != 2)
         {
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
             if (decide_key_frame(cpi))
             {
                 /* Reset all our sizing numbers and recode */
@@ -4466,9 +4444,9 @@ static void encode_frame_to_data_rate
                 /* Assume 1 qstep = about 4% on frame size. */
                 over_size_percent = (int)(over_size_percent * 0.96);
             }
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
             top_index = cpi->active_worst_quality;
-#endif
+#endif  // !CONFIG_REALTIME_ONLY
             /* If we have updated the active max Q do not call
              * vp8_update_rate_correction_factors() this loop.
              */
@@ -4477,7 +4455,7 @@ static void encode_frame_to_data_rate
         else
             active_worst_qchanged = 0;
 
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
         /* Special case handling for forced key frames */
         if ( (cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced )
         {
@@ -5215,7 +5193,7 @@ static void encode_frame_to_data_rate
 
 
 }
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned char * dest_end, unsigned int *frame_flags)
 {
 
@@ -5299,7 +5277,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
 
     cpi->source = NULL;
 
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
     /* Should we code an alternate reference frame */
     if (cpi->oxcf.error_resilient_mode == 0 &&
         cpi->oxcf.play_alternate &&
@@ -5367,7 +5345,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
     else
     {
         *size = 0;
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 
         if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done)
         {
@@ -5560,7 +5538,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
 
         assert(i < NUM_YV12_BUFFERS );
     }
-#if !(CONFIG_REALTIME_ONLY)
+#if !CONFIG_REALTIME_ONLY
 
     if (cpi->pass == 1)
     {
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
index 8beba27f..317e4b9e 100644
--- a/libvpx/vp8/encoder/onyx_int.h
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -716,6 +716,8 @@ typedef struct VP8_COMP
     } rd_costs;
 } VP8_COMP;
 
+void vp8_initialize_enc(void);
+
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
 void vp8_new_framerate(VP8_COMP *cpi, double framerate);
diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c
index 5ce98ad2..d0fff3f0 100644
--- a/libvpx/vp8/encoder/pickinter.c
+++ b/libvpx/vp8/encoder/pickinter.c
@@ -21,10 +21,12 @@
 #include "vp8/common/findnearmv.h"
 #include "encodemb.h"
 #include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
 #include "vpx_dsp/variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_TEMPORAL_DENOISING
 #include "denoising.h"
@@ -72,7 +74,7 @@ static int macroblock_corner_grad(unsigned char* signal, int stride,
   int y2 = signal[offsetx * stride + offsety + sgny];
   int y3 = signal[(offsetx + sgnx) * stride + offsety];
   int y4 = signal[(offsetx + sgnx) * stride + offsety + sgny];
-  return MAX(MAX(abs(y1 - y2), abs(y1 - y3)), abs(y1 - y4));
+  return VPXMAX(VPXMAX(abs(y1 - y2), abs(y1 - y3)), abs(y1 - y4));
 }
 
 static int check_dot_artifact_candidate(VP8_COMP *cpi,
@@ -813,9 +815,18 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
     // Check if current macroblock is in skin area.
     {
-    const int y = x->src.y_buffer[7 * x->src.y_stride + 7];
-    const int cb = x->src.u_buffer[3 * x->src.uv_stride + 3];
-    const int cr = x->src.v_buffer[3 * x->src.uv_stride + 3];
+    const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] +
+        x->src.y_buffer[7 * x->src.y_stride + 8] +
+        x->src.y_buffer[8 * x->src.y_stride + 7] +
+        x->src.y_buffer[8 * x->src.y_stride + 8]) >> 2;
+    const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] +
+        x->src.u_buffer[3 * x->src.uv_stride + 4] +
+        x->src.u_buffer[4 * x->src.uv_stride + 3] +
+        x->src.u_buffer[4 * x->src.uv_stride + 4]) >> 2;
+    const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] +
+        x->src.v_buffer[3 * x->src.uv_stride + 4] +
+        x->src.v_buffer[4 * x->src.uv_stride + 3] +
+        x->src.v_buffer[4 * x->src.uv_stride + 4]) >> 2;
     x->is_skin = 0;
     if (!cpi->oxcf.screen_content_mode)
       x->is_skin = is_skin_color(y, cb, cr);
@@ -824,7 +835,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     if (cpi->oxcf.noise_sensitivity) {
       // Under aggressive denoising mode, should we use skin map to reduce denoiser
       // and ZEROMV bias? Will need to revisit the accuracy of this detection for
-      // very noisy input. For now keep this as is (i.e., don't turn it off). 
+      // very noisy input. For now keep this as is (i.e., don't turn it off).
       // if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive)
       //   x->is_skin = 0;
     }
@@ -874,7 +885,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
     /* If the frame has big static background and current MB is in low
     *  motion area, its mode decision is biased to ZEROMV mode.
-    *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). 
+    *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
     *  At such speed settings, ZEROMV is already heavily favored.
     */
     if (cpi->Speed < 12) {
@@ -1136,8 +1147,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_MULTI_RES_ENCODING
             if (parent_ref_valid && (parent_ref_frame == this_ref_frame) &&
                 dissim <= 2 &&
-                MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
-                    abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
+                VPXMAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
+                       abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <=
+                    4)
             {
                 d->bmi.mv.as_int = mvp_full.as_int;
                 mode_mv[NEWMV].as_int = mvp_full.as_int;
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
index e8796a1f..7da3d71a 100644
--- a/libvpx/vp8/encoder/ratectrl.c
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -22,6 +22,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/systemdependent.h"
 #include "encodemv.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 
 #define MIN_BPB_FACTOR          0.01
@@ -380,7 +381,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
         int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
         /* Boost depends somewhat on frame rate: only used for 1 layer case. */
         if (cpi->oxcf.number_of_layers == 1) {
-          kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
+          kf_boost = VPXMAX(initial_boost,
+                            (int)(2 * cpi->output_framerate - 16));
         }
         else {
           /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
@@ -1591,11 +1593,38 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
     if (Q < thresh_qp &&
         cpi->projected_frame_size > thresh_rate &&
         pred_err_mb > thresh_pred_err_mb) {
+      double new_correction_factor = cpi->rate_correction_factor;
+      const int target_size = cpi->av_per_frame_bandwidth;
+      int target_bits_per_mb;
       // Drop this frame: advance frame counters, and set force_maxqp flag.
       cpi->common.current_video_frame++;
       cpi->frames_since_key++;
       // Flag to indicate we will force next frame to be encoded at max QP.
       cpi->force_maxqp = 1;
+      // Reset the buffer levels.
+      cpi->buffer_level = cpi->oxcf.optimal_buffer_level;
+      cpi->bits_off_target = cpi->oxcf.optimal_buffer_level;
+      // Compute a new rate correction factor, corresponding to the current
+      // target frame size and max_QP, and adjust the rate correction factor
+      // upwards, if needed.
+      // This is to prevent a bad state where the re-encoded frame at max_QP
+      // undershoots significantly, and then we end up dropping every other
+      // frame because the QP/rate_correction_factor may have been too low
+      // before the drop and then takes too long to come up.
+      if (target_size >= (INT_MAX >> BPER_MB_NORMBITS))
+        target_bits_per_mb =
+            (target_size / cpi->common.MBs) << BPER_MB_NORMBITS;
+      else
+        target_bits_per_mb =
+            (target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
+      // Rate correction factor based on target_size_per_mb and max_QP.
+      new_correction_factor = (double)target_bits_per_mb /
+          (double)vp8_bits_per_mb[INTER_FRAME][cpi->worst_quality];
+      if (new_correction_factor > cpi->rate_correction_factor)
+        cpi->rate_correction_factor =
+            VPXMIN(2.0 * cpi->rate_correction_factor, new_correction_factor);
+      if (cpi->rate_correction_factor > MAX_BPB_FACTOR)
+        cpi->rate_correction_factor = MAX_BPB_FACTOR;
       return 1;
     } else {
       cpi->force_maxqp = 0;
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
index fdff378b..ab0ad159 100644
--- a/libvpx/vp8/encoder/rdopt.c
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -24,6 +24,7 @@
 #include "pickinter.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/quant_common.h"
diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h
index b4fcd10b..1cb1a072 100644
--- a/libvpx/vp8/encoder/rdopt.h
+++ b/libvpx/vp8/encoder/rdopt.h
@@ -12,13 +12,15 @@
 #ifndef VP8_ENCODER_RDOPT_H_
 #define VP8_ENCODER_RDOPT_H_
 
+#include "./vpx_config.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 
-static void insertsortmv(int arr[], int len)
+static INLINE void insertsortmv(int arr[], int len)
 {
     int i, j, k;
 
@@ -41,7 +43,7 @@ static void insertsortmv(int arr[], int len)
     }
 }
 
-static void insertsortsad(int arr[],int idx[], int len)
+static INLINE void insertsortsad(int arr[],int idx[], int len)
 {
     int i, j, k;
 
@@ -77,10 +79,10 @@ extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
 extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
 
 
-static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
-                               unsigned char            *plane[3],
-                               unsigned int              recon_yoffset,
-                               unsigned int              recon_uvoffset)
+static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
+                                      unsigned char *plane[3],
+                                      unsigned int recon_yoffset,
+                                      unsigned int recon_uvoffset)
 {
     plane[0] = fb->y_buffer + recon_yoffset;
     plane[1] = fb->u_buffer + recon_uvoffset;
@@ -88,10 +90,10 @@ static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
 }
 
 
-static void get_predictor_pointers(const VP8_COMP *cpi,
-                                       unsigned char  *plane[4][3],
-                                       unsigned int    recon_yoffset,
-                                       unsigned int    recon_uvoffset)
+static INLINE void get_predictor_pointers(const VP8_COMP *cpi,
+                                          unsigned char *plane[4][3],
+                                          unsigned int recon_yoffset,
+                                          unsigned int recon_uvoffset)
 {
     if (cpi->ref_frame_flags & VP8_LAST_FRAME)
         get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx],
@@ -107,8 +109,8 @@ static void get_predictor_pointers(const VP8_COMP *cpi,
 }
 
 
-static void get_reference_search_order(const VP8_COMP *cpi,
-                                           int             ref_frame_map[4])
+static INLINE void get_reference_search_order(const VP8_COMP *cpi,
+                                              int ref_frame_map[4])
 {
     int i=0;
 
diff --git a/libvpx/vp8/encoder/treewriter.h b/libvpx/vp8/encoder/treewriter.h
index cfb2730a..2debf927 100644
--- a/libvpx/vp8/encoder/treewriter.h
+++ b/libvpx/vp8/encoder/treewriter.h
@@ -15,6 +15,7 @@
 /* Trees map alphabets into huffman-like codes suitable for an arithmetic
    bit coder.  Timothy S Murphy  11 October 2004 */
 
+#include "./vpx_config.h"
 #include "vp8/common/treecoder.h"
 
 #include "boolhuff.h"       /* for now */
@@ -46,7 +47,7 @@ typedef BOOL_CODER vp8_writer;
 
 /* Both of these return bits, not scaled bits. */
 
-static unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
 {
     /* Imitate existing calculation */
 
@@ -76,7 +77,7 @@ static void vp8_treed_write
     }
     while (n);
 }
-static void vp8_write_token
+static INLINE void vp8_write_token
 (
     vp8_writer *const w,
     vp8_tree t,
@@ -107,7 +108,7 @@ static int vp8_treed_cost(
 
     return c;
 }
-static int vp8_cost_token
+static INLINE int vp8_cost_token
 (
     vp8_tree t,
     const vp8_prob *const p,
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
index 3ad11c77..4c4e8562 100644
--- a/libvpx/vp8/vp8_common.mk
+++ b/libvpx/vp8/vp8_common.mk
@@ -45,6 +45,7 @@ VP8_COMMON_SRCS-yes += common/mv.h
 VP8_COMMON_SRCS-yes += common/onyxc_int.h
 VP8_COMMON_SRCS-yes += common/quant_common.h
 VP8_COMMON_SRCS-yes += common/reconinter.h
+VP8_COMMON_SRCS-yes += common/reconintra.h
 VP8_COMMON_SRCS-yes += common/reconintra4x4.h
 VP8_COMMON_SRCS-yes += common/rtcd.c
 VP8_COMMON_SRCS-yes += common/rtcd_defs.pl
@@ -88,7 +89,6 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
@@ -118,7 +118,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/bilinear_filter_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
-VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/reconintra_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
 
@@ -146,7 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/loopfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/intra4x4_predict_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequant_idct_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequantize_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_blk_v6.c
@@ -165,7 +163,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimpleverticaledge_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
 
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
index fe88cd4b..c125ae84 100644
--- a/libvpx/vp8/vp8_cx_iface.c
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -17,6 +17,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
@@ -237,7 +238,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
         RANGE_CHECK_HI(cfg, ts_periodicity, 16);
 
         for (i=1; i<cfg->ts_number_layers; i++)
-            if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1] && 
+            if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1] &&
                 cfg->rc_target_bitrate > 0)
                 ERROR("ts_target_bitrate entries are not strictly increasing");
 
@@ -693,6 +694,8 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
         else
             ctx->priv->enc.total_encoders   = 1;
 
+        once(vp8_initialize_enc);
+
         res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
 
         if (!res)
@@ -879,7 +882,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
     }
     ctx->control_frame_flags = 0;
 
-    res = set_reference_and_update(ctx, flags);
+    if (!res)
+        res = set_reference_and_update(ctx, flags);
 
     /* Handle fixed keyframe intervals */
     if (ctx->cfg.kf_mode == VPX_KF_AUTO
@@ -1273,9 +1277,6 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
     {VP8_SET_REFERENCE,                 vp8e_set_reference},
     {VP8_COPY_REFERENCE,                vp8e_get_reference},
     {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
-    {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
-    {VP8E_UPD_REFERENCE,                vp8e_update_reference},
-    {VP8E_USE_REFERENCE,                vp8e_use_reference},
     {VP8E_SET_FRAME_FLAGS,              vp8e_set_frame_flags},
     {VP8E_SET_TEMPORAL_LAYER_ID,        vp8e_set_temporal_layer_id},
     {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
index 72e4770c..a12a2ad0 100644
--- a/libvpx/vp8/vp8_dx_iface.c
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -22,6 +22,7 @@
 #include "common/common.h"
 #include "common/onyxd.h"
 #include "decoder/onyxd_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_ERROR_CONCEALMENT
 #include "decoder/error_concealment.h"
@@ -42,8 +43,6 @@ typedef enum
 } mem_seg_id_t;
 #define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
 
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
-
 struct vpx_codec_alg_priv
 {
     vpx_codec_priv_t        base;
@@ -68,18 +67,6 @@ struct vpx_codec_alg_priv
     FRAGMENT_DATA           fragments;
 };
 
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t flags)
-{
-    /* Although this declaration is constant, we can't use it in the requested
-     * segments list because we want to define the requested segments list
-     * before defining the private type (so that the number of memory maps is
-     * known)
-     */
-    (void)si;
-    (void)flags;
-    return sizeof(vpx_codec_alg_priv_t);
-}
-
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
 {
     vpx_codec_alg_priv_t *priv =
@@ -180,7 +167,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
         const uint8_t *clear = data;
         if (decrypt_cb)
         {
-            int n = MIN(sizeof(clear_buffer), data_sz);
+            int n = VPXMIN(sizeof(clear_buffer), data_sz);
             decrypt_cb(decrypt_state, data, clear_buffer, n);
             clear = clear_buffer;
         }
@@ -259,8 +246,8 @@ static void yuvconfig2image(vpx_image_t               *img,
     img->fmt = VPX_IMG_FMT_I420;
     img->w = yv12->y_stride;
     img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
-    img->d_w = yv12->y_width;
-    img->d_h = yv12->y_height;
+    img->d_w = img->r_w = yv12->y_width;
+    img->d_h = img->r_h = yv12->y_height;
     img->x_chroma_shift = 1;
     img->y_chroma_shift = 1;
     img->planes[VPX_PLANE_Y] = yv12->y_buffer;
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index ac417b69..24c6c54e 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -115,6 +115,8 @@ void vp9_free_context_buffers(VP9_COMMON *cm) {
   cm->above_context = NULL;
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
+  vpx_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
 }
 
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
@@ -149,6 +151,16 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->above_context_alloc_cols = cm->mi_cols;
   }
 
+  vpx_free(cm->lf.lfm);
+
+  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
+  // stride and rows are rounded up / truncated to a multiple of 8.
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
+  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
+      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
+      sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm) goto fail;
+
   return 0;
 
  fail:
diff --git a/libvpx/vp9/common/vp9_blockd.c b/libvpx/vp9/common/vp9_blockd.c
index e8334fc8..0e104ee5 100644
--- a/libvpx/vp9/common/vp9_blockd.c
+++ b/libvpx/vp9/common/vp9_blockd.c
@@ -129,7 +129,6 @@ void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y;
     xd->plane[i].subsampling_x = i ? ss_x : 0;
     xd->plane[i].subsampling_y = i ? ss_y : 0;
   }
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index d776b440..61eb5916 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -14,6 +14,7 @@
 
 #include "./vpx_config.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
 
@@ -119,7 +120,6 @@ struct buf_2d {
 
 struct macroblockd_plane {
   tran_low_t *dqcoeff;
-  PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
   struct buf_2d dst;
@@ -175,7 +175,6 @@ typedef struct macroblockd {
   int mb_to_bottom_edge;
 
   FRAME_CONTEXT *fc;
-  int frame_parallel_decoding_mode;
 
   /* pointers to reference frames */
   RefBuffer *block_refs[2];
@@ -200,6 +199,10 @@ typedef struct macroblockd {
   struct vpx_internal_error_info *error_info;
 } MACROBLOCKD;
 
+static INLINE PLANE_TYPE get_plane_type(int plane) {
+  return (PLANE_TYPE)(plane > 0);  // plane 0 -> 0, plane > 0 -> 1
+}
+
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
                                      PARTITION_TYPE partition) {
   return subsize_lookup[partition][bsize];
@@ -235,7 +238,7 @@ static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
     return TX_4X4;
   } else {
     const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss];
-    return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
+    return VPXMIN(y_tx_size, max_txsize_lookup[plane_bsize]);
   }
 }
 
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
index 0bf7cbcc..a6dae6a1 100644
--- a/libvpx/vp9/common/vp9_common_data.c
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -9,6 +9,7 @@
  */
 
 #include "vp9/common/vp9_common_data.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 // Log 2 conversion lookup tables for block width and height
 const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
@@ -27,7 +28,7 @@ const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
 const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
 
-// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
+// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
 const uint8_t size_group_lookup[BLOCK_SIZES] =
   {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
 
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index a1746bce..21611ed6 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -75,21 +75,6 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high12[18]);
 
 #define EOB_MODEL_TOKEN 3
 
-typedef struct {
-  const vpx_tree_index *tree;
-  const vpx_prob *prob;
-  int len;
-  int base_val;
-  const int16_t *cost;
-} vp9_extra_bit;
-
-// indexed by token value
-extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS];
-#if CONFIG_VP9_HIGHBITDEPTH
-extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
-extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 #define DCT_MAX_VALUE           16384
 #if CONFIG_VP9_HIGHBITDEPTH
 #define DCT_MAX_VALUE_HIGH10    65536
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 0915918e..b8a11322 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -775,7 +776,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
   // an 8x8 in that the internal ones can be skipped and don't depend on
   // the prediction block size.
   if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
+    *int_4x4_y |= size_mask[block_size] << shift_y;
 
   if (tx_size_uv == TX_4X4)
     *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
@@ -821,7 +822,121 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
               left_64x64_txform_mask[tx_size_y]) << shift_y;
 
   if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
+    *int_4x4_y |= size_mask[block_size] << shift_y;
+}
+
+void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row,
+                     const int mi_col, LOOP_FILTER_MASK *lfm) {
+  int i;
+
+  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+  // for 32x32 transforms also.
+  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+  // We apply at least an 8-tap filter on every 32x32 even if the transform
+  // size is 4x4. So if a 4x4 bit is set on a border pixel, add it to the 8x8
+  // mask and remove it from the 4x4 mask.
+  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+  lfm->left_y[TX_4X4] &= ~left_border;
+  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+  lfm->above_y[TX_4X4] &= ~above_border;
+  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+  lfm->left_uv[TX_4X4] &= ~left_border_uv;
+  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+  lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+  // We do some special edge handling.
+  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+    const uint64_t rows = cm->mi_rows - mi_row;
+
+    // Each pixel inside the border gets a 1.
+    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
+    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+
+    // Remove values completely outside our border.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv;
+
+    // We don't apply a wide loop filter on the last uv block row. If set
+    // apply the shorter one instead.
+    if (rows == 1) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+      lfm->above_uv[TX_16X16] = 0;
+    }
+    if (rows == 5) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+    }
+  }
+
+  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+    const uint64_t columns = cm->mi_cols - mi_col;
+
+    // Each pixel inside the border gets a 1, the multiply copies the border
+    // to where we need it.
+    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
+    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+    // Internal edges are not applied on the last column of the image so
+    // we mask 1 more for the internal edges
+    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
+
+    // Remove the bits outside the image edge.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv_int;
+
+    // We don't apply a wide loop filter on the last uv column. If set
+    // apply the shorter one instead.
+    if (columns == 1) {
+      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+      lfm->left_uv[TX_16X16] = 0;
+    }
+    if (columns == 5) {
+      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+    }
+  }
+  // We don't apply a loop filter on the first column in the image, mask that
+  // out.
+  if (mi_col == 0) {
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= 0xfefefefefefefefeULL;
+      lfm->left_uv[i] &= 0xeeee;
+    }
+  }
+
+  // Assert if we try to apply 2 different loop filters at the same position.
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
+  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
 }
 
 // This function sets up the bit masks for the entire 64x64 region represented
@@ -854,7 +969,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
   const int shift_8_y[] = {0, 1, 8, 9};
   const int shift_32_uv[] = {0, 2, 8, 10};
   const int shift_16_uv[] = {0, 1, 4, 5};
-  int i;
   const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
                         cm->mi_rows - mi_row : MI_BLOCK_SIZE);
   const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
@@ -969,114 +1083,8 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
       }
       break;
   }
-  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
-  // for 32x32 transforms also.
-  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
-  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
-  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
-  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
 
-  // We do at least 8 tap filter on every 32x32 even if the transform size
-  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
-  // remove it from the 4x4.
-  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
-  lfm->left_y[TX_4X4] &= ~left_border;
-  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
-  lfm->above_y[TX_4X4] &= ~above_border;
-  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
-  lfm->left_uv[TX_4X4] &= ~left_border_uv;
-  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
-  lfm->above_uv[TX_4X4] &= ~above_border_uv;
-
-  // We do some special edge handling.
-  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
-    const uint64_t rows = cm->mi_rows - mi_row;
-
-    // Each pixel inside the border gets a 1,
-    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
-    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
-
-    // Remove values completely outside our border.
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= mask_y;
-      lfm->above_y[i] &= mask_y;
-      lfm->left_uv[i] &= mask_uv;
-      lfm->above_uv[i] &= mask_uv;
-    }
-    lfm->int_4x4_y &= mask_y;
-    lfm->int_4x4_uv &= mask_uv;
-
-    // We don't apply a wide loop filter on the last uv block row. If set
-    // apply the shorter one instead.
-    if (rows == 1) {
-      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
-      lfm->above_uv[TX_16X16] = 0;
-    }
-    if (rows == 5) {
-      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
-      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
-    }
-  }
-
-  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
-    const uint64_t columns = cm->mi_cols - mi_col;
-
-    // Each pixel inside the border gets a 1, the multiply copies the border
-    // to where we need it.
-    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
-    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
-
-    // Internal edges are not applied on the last column of the image so
-    // we mask 1 more for the internal edges
-    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
-
-    // Remove the bits outside the image edge.
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= mask_y;
-      lfm->above_y[i] &= mask_y;
-      lfm->left_uv[i] &= mask_uv;
-      lfm->above_uv[i] &= mask_uv;
-    }
-    lfm->int_4x4_y &= mask_y;
-    lfm->int_4x4_uv &= mask_uv_int;
-
-    // We don't apply a wide loop filter on the last uv column. If set
-    // apply the shorter one instead.
-    if (columns == 1) {
-      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
-      lfm->left_uv[TX_16X16] = 0;
-    }
-    if (columns == 5) {
-      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
-      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
-    }
-  }
-  // We don't apply a loop filter on the first column in the image, mask that
-  // out.
-  if (mi_col == 0) {
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= 0xfefefefefefefefeULL;
-      lfm->left_uv[i] &= 0xeeee;
-    }
-  }
-
-  // Assert if we try to apply 2 different loop filters at the same position.
-  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
-  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
-  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
-  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
-  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
-  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
-  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
-  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
-  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
-  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
-  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
-  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
-  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
-  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
-  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
-  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
+  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1188,9 +1196,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
       const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
           !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
       const int skip_this_r = skip_this && !block_edge_above;
-      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                            ? get_uv_tx_size(&mi[0].mbmi, plane)
-                            : mi[0].mbmi.tx_size;
+      const TX_SIZE tx_size = get_uv_tx_size(&mi[0].mbmi, plane);
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
@@ -1427,6 +1433,7 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   int r, c;
+  uint8_t lfl_uv[16];
 
   uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
   uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
@@ -1437,11 +1444,9 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
 
   // Vertical pass: do 2 rows at one time
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
-    if (plane->plane_type == 1) {
-      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-        lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
-        lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
-      }
+    for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+      lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+      lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
     }
 
     {
@@ -1456,18 +1461,18 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
         highbd_filter_selectively_vert_row2(
             plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+            &lfl_uv[r << 1], (int)cm->bit_depth);
       } else {
         filter_selectively_vert_row2(
             plane->subsampling_x, dst->buf, dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1]);
+            &lfl_uv[r << 1]);
       }
 #else
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_uv[r << 1]);
+          &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 16 * dst->stride;
@@ -1508,16 +1513,16 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
                                       mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+                                      &lfl_uv[r << 1], (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r << 1]);
+                               &lfl_uv[r << 1]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfm->lfl_uv[r << 1]);
+                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1528,13 +1533,11 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
   }
 }
 
-void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          VP9_COMMON *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only) {
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   enum lf_path path;
-  LOOP_FILTER_MASK lfm;
   int mi_row, mi_col;
 
   if (y_only)
@@ -1548,24 +1551,24 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
       int plane;
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                     &lfm);
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
           case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_SLOW:
             vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
@@ -1588,13 +1591,135 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
   if (partial_frame && cm->mi_rows > 8) {
     start_mi_row = cm->mi_rows >> 1;
     start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
+  loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only);
+}
+
+// Used by the encoder to build the loopfilter masks.
+void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
+                          int partial_frame) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+  int mi_col, mi_row;
+  if (!frame_filter_level) return;
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
+
   vp9_loop_filter_frame_init(cm, frame_filter_level);
-  vp9_loop_filter_rows(frame, cm, xd->plane,
-                       start_mi_row, end_mi_row,
-                       y_only);
+
+  for (mi_row = start_mi_row; mi_row < end_mi_row; mi_row += MI_BLOCK_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      // vp9_setup_mask() zeros lfm
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     get_lfm(&cm->lf, mi_row, mi_col));
+    }
+  }
+}
+
+// 8x8 blocks in a superblock.  A "1" represents the first block in a 16x16
+// or greater area.
+static const uint8_t first_block_in_16x16[8][8] = {
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0}
+};
+
+// This function sets up the bit masks for a block represented
+// by mi_row, mi_col in a 64x64 region.
+// TODO(SJL): This function only works for yv12.
+void vp9_build_mask(VP9_COMMON *cm, const MB_MODE_INFO *mbmi, int mi_row,
+                    int mi_col, int bw, int bh) {
+  const BLOCK_SIZE block_size = mbmi->sb_type;
+  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+  const int filter_level = get_filter_level(lfi_n, mbmi);
+  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
+  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
+  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
+  const int row_in_sb = (mi_row & 7);
+  const int col_in_sb = (mi_col & 7);
+  const int shift_y = col_in_sb + (row_in_sb << 3);
+  const int shift_uv = (col_in_sb >> 1) + ((row_in_sb >> 1) << 2);
+  const int build_uv = first_block_in_16x16[row_in_sb][col_in_sb];
+
+  if (!filter_level) {
+    return;
+  } else {
+    int index = shift_y;
+    int i;
+    for (i = 0; i < bh; i++) {
+      memset(&lfm->lfl_y[index], filter_level, bw);
+      index += 8;
+    }
+  }
+
+  // These set 1 in the current block size for the block size edges.
+  // For instance if the block size is 32x16, we'll set:
+  //    above =   1111
+  //              0000
+  //    and
+  //    left  =   1000
+  //          =   1000
+  // NOTE: In this example the low bit is leftmost, so ( 1000 ) is stored as
+  //       1, not 8.
+  //
+  // U and V set things on a 16 bit scale.
+  //
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+
+  if (build_uv) {
+    *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+    *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
+  }
+
+  // If the block has no coefficients and is not intra we skip applying
+  // the loop filter on block edges.
+  if (mbmi->skip && is_inter_block(mbmi))
+    return;
+
+  // Add a mask for the transform size. The transform size mask is set to
+  // be correct for a 64x64 prediction block size. Mask to match the size of
+  // the block we are working on and then shift it into place.
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  if (build_uv) {
+    *above_uv |= (size_mask_uv[block_size] &
+                  above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+    *left_uv |= (size_mask_uv[block_size] &
+                 left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+  }
+
+  // Try to determine what to do with the internal 4x4 block boundaries.  These
+  // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the
+  // internal ones can be skipped and don't depend on the prediction block size.
+  if (tx_size_y == TX_4X4)
+    *int_4x4_y |= size_mask[block_size] << shift_y;
+
+  if (build_uv && tx_size_uv == TX_4X4)
+    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
 }
 
 void vp9_loop_filter_data_reset(
@@ -1608,9 +1733,17 @@ void vp9_loop_filter_data_reset(
   memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
 }
 
+void vp9_reset_lfm(VP9_COMMON *const cm) {
+  if (cm->lf.filter_level) {
+    memset(cm->lf.lfm, 0,
+           ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride *
+            sizeof(*cm->lf.lfm));
+  }
+}
+
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
   (void)unused;
-  vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                       lf_data->start, lf_data->stop, lf_data->y_only);
+  loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                   lf_data->start, lf_data->stop, lf_data->y_only);
   return 1;
 }
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index f7cbde67..7f943ea0 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -35,24 +35,6 @@ enum lf_path {
   LF_PATH_SLOW,
 };
 
-struct loopfilter {
-  int filter_level;
-
-  int sharpness_level;
-  int last_sharpness_level;
-
-  uint8_t mode_ref_delta_enabled;
-  uint8_t mode_ref_delta_update;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char ref_deltas[MAX_REF_LF_DELTAS];
-  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
-
-  // 0 = ZERO_MV, MV
-  signed char mode_deltas[MAX_MODE_LF_DELTAS];
-  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
-};
-
 // Need to align this structure so when it is declared and
 // passed it can be loaded into vector registers.
 typedef struct {
@@ -83,9 +65,29 @@ typedef struct {
   uint16_t above_uv[TX_SIZES];
   uint16_t int_4x4_uv;
   uint8_t lfl_y[64];
-  uint8_t lfl_uv[16];
 } LOOP_FILTER_MASK;
 
+struct loopfilter {
+  int filter_level;
+
+  int sharpness_level;
+  int last_sharpness_level;
+
+  uint8_t mode_ref_delta_enabled;
+  uint8_t mode_ref_delta_update;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char ref_deltas[MAX_REF_LF_DELTAS];
+  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
+
+  // 0 = ZERO_MV, MV
+  signed char mode_deltas[MAX_MODE_LF_DELTAS];
+  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+
+  LOOP_FILTER_MASK *lfm;
+  int lfm_stride;
+};
+
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;
@@ -116,7 +118,7 @@ void vp9_filter_block_plane_non420(struct VP9Common *cm,
 void vp9_loop_filter_init(struct VP9Common *cm);
 
 // Update the loop filter for the current frame.
-// This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame()
+// This should be called before vp9_loop_filter_frame(); vp9_build_mask_frame()
 // calls this function directly.
 void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
 
@@ -126,11 +128,19 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
                            int filter_level,
                            int y_only, int partial_frame);
 
-// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          struct VP9Common *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only);
+// Get the superblock lfm for a given mi_row, mi_col.
+static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf,
+                                        const int mi_row, const int mi_col) {
+  return &lf->lfm[(mi_col >> 3) + ((mi_row >> 3) * lf->lfm_stride)];
+}
+
+void vp9_build_mask(struct VP9Common *cm, const MB_MODE_INFO *mbmi, int mi_row,
+                    int mi_col, int bw, int bh);
+void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row,
+                     const int mi_col, LOOP_FILTER_MASK *lfm);
+void vp9_build_mask_frame(struct VP9Common *cm, int frame_filter_level,
+                          int partial_frame);
+void vp9_reset_lfm(struct VP9Common *const cm);
 
 typedef struct LoopFilterWorkerData {
   YV12_BUFFER_CONFIG *frame_buffer;
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index c373c027..ceffdedf 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -112,10 +112,11 @@ typedef struct BufferPool {
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
   vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
   int width;
   int height;
-  int display_width;
-  int display_height;
+  int render_width;
+  int render_height;
   int last_width;
   int last_height;
 
@@ -357,13 +358,12 @@ static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
     xd->above_context[i] = cm->above_context +
         i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
 
-    if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
+    if (get_plane_type(i) == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
     } else {
       memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
     }
     xd->fc = cm->fc;
-    xd->frame_parallel_decoding_mode = cm->frame_parallel_decoding_mode;
   }
 
   xd->above_seg_context = cm->above_seg_context;
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index 71ab8615..b685d813 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -16,6 +16,7 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
 #include "vpx_scale/vpx_scale.h"
@@ -625,7 +626,7 @@ static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
 
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
-  const int q = MIN(105, cm->lf.filter_level * 2);
+  const int q = VPXMIN(105, cm->lf.filter_level * 2);
   const int flags = ppflags->post_proc_flag;
   YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
   struct postproc_state *const ppstate = &cm->postproc_state;
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 67b95dbc..6f7af4a5 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -13,6 +13,7 @@
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,14 +25,14 @@ static INLINE int get_segment_id(const VP9_COMMON *cm,
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
   int x, y, segment_id = MAX_SEGMENTS;
 
   for (y = 0; y < ymis; ++y)
     for (x = 0; x < xmis; ++x)
-      segment_id = MIN(segment_id,
-                       segment_ids[mi_offset + y * cm->mi_cols + x]);
+      segment_id =
+          VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
   return segment_id;
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index f83f8257..d8c14ecc 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -187,7 +187,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     const int is_scaled = vp9_is_scaled(sf);
 
     if (is_scaled) {
-      pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+      // Co-ordinate of containing block to pixel precision.
+      const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+      if (plane == 0)
+        pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
+      else if (plane == 1)
+        pre_buf->buf = xd->block_refs[ref]->buf->u_buffer;
+      else
+        pre_buf->buf = xd->block_refs[ref]->buf->v_buffer;
+
+      pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y,
+                                           pre_buf->stride, sf);
+      pre = pre_buf->buf;
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
       ys = sf->y_step_q4;
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index e60eff80..3d84a288 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -133,7 +133,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
   int frame_width, frame_height;
   int x0, y0;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  //  int base=128;
   int base = 128 << (bd - 8);
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl
index 737fc56d..5bf71ef9 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -85,16 +85,26 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
 # dct
 #
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  # Note as optimized versions of these functions are added we need to add a check to ensure
-  # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/vp9_iht4x4_16_add/;
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add/;
+
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add/;
+
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add/;
+  } else {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add sse2/;
 
-  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/vp9_iht8x8_64_add/;
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add sse2/;
 
-  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-  specialize qw/vp9_iht16x16_256_add/;
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add sse2/;
+  }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
   if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
@@ -231,11 +241,15 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
 }
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-# the transform coefficients are held in 32-bit
-# values, so the assembler code for  vp9_block_error can no longer be used.
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error/;
 
+  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
+
+  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
+
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp/;
 
@@ -310,9 +324,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   # ENCODEMB INVOKE
 
-  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error sse2/;
-
   add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp/;
 
diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c
index 6b11c93c..db78d6be 100644
--- a/libvpx/vp9/common/vp9_thread_common.c
+++ b/libvpx/vp9/common/vp9_thread_common.c
@@ -9,6 +9,7 @@
  */
 
 #include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_thread_common.h"
@@ -108,29 +109,27 @@ void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
   for (mi_row = start; mi_row < stop;
        mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
     MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
       const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
       const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
-      LOOP_FILTER_MASK lfm;
       int plane;
 
       sync_read(lf_sync, r, c);
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      // TODO(JBB): Make setup_mask work for non 420.
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                     &lfm);
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
           case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_SLOW:
             vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
@@ -165,7 +164,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
   // Decoder may allocate more threads than number of tiles based on user's
   // input.
   const int tile_cols = 1 << cm->log2_tile_cols;
-  const int num_workers = MIN(nworkers, tile_cols);
+  const int num_workers = VPXMIN(nworkers, tile_cols);
   int i;
 
   if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
@@ -229,7 +228,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
   if (partial_frame && cm->mi_rows > 8) {
     start_mi_row = cm->mi_rows >> 1;
     start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
   vp9_loop_filter_frame_init(cm, frame_filter_level);
@@ -317,21 +316,21 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
 }
 
 // Accumulate frame counts.
-void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
-                                 int is_dec) {
+void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
+                                 const FRAME_COUNTS *counts, int is_dec) {
   int i, j, k, l, m;
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
     for (j = 0; j < INTRA_MODES; j++)
-      cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+      accum->y_mode[i][j] += counts->y_mode[i][j];
 
   for (i = 0; i < INTRA_MODES; i++)
     for (j = 0; j < INTRA_MODES; j++)
-      cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
+      accum->uv_mode[i][j] += counts->uv_mode[i][j];
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
     for (j = 0; j < PARTITION_TYPES; j++)
-      cm->counts.partition[i][j] += counts->partition[i][j];
+      accum->partition[i][j] += counts->partition[i][j];
 
   if (is_dec) {
     int n;
@@ -340,10 +339,10 @@ void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
         for (k = 0; k < REF_TYPES; k++)
           for (l = 0; l < COEF_BANDS; l++)
             for (m = 0; m < COEFF_CONTEXTS; m++) {
-              cm->counts.eob_branch[i][j][k][l][m] +=
+              accum->eob_branch[i][j][k][l][m] +=
                   counts->eob_branch[i][j][k][l][m];
               for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
-                cm->counts.coef[i][j][k][l][m][n] +=
+                accum->coef[i][j][k][l][m][n] +=
                     counts->coef[i][j][k][l][m][n];
             }
   } else {
@@ -352,64 +351,64 @@ void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
         for (k = 0; k < REF_TYPES; k++)
           for (l = 0; l < COEF_BANDS; l++)
             for (m = 0; m < COEFF_CONTEXTS; m++)
-              cm->counts.eob_branch[i][j][k][l][m] +=
+              accum->eob_branch[i][j][k][l][m] +=
                   counts->eob_branch[i][j][k][l][m];
-                // In the encoder, cm->counts.coef is only updated at frame
+                // In the encoder, coef is only updated at frame
                 // level, so not need to accumulate it here.
                 // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
-                //   cm->counts.coef[i][j][k][l][m][n] +=
+                //   accum->coef[i][j][k][l][m][n] +=
                 //       counts->coef[i][j][k][l][m][n];
   }
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     for (j = 0; j < SWITCHABLE_FILTERS; j++)
-      cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
+      accum->switchable_interp[i][j] += counts->switchable_interp[i][j];
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
     for (j = 0; j < INTER_MODES; j++)
-      cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
+      accum->inter_mode[i][j] += counts->inter_mode[i][j];
 
   for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
+      accum->intra_inter[i][j] += counts->intra_inter[i][j];
 
   for (i = 0; i < COMP_INTER_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
+      accum->comp_inter[i][j] += counts->comp_inter[i][j];
 
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
       for (k = 0; k < 2; k++)
-      cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+      accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
 
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+      accum->comp_ref[i][j] += counts->comp_ref[i][j];
 
   for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
     for (j = 0; j < TX_SIZES; j++)
-      cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+      accum->tx.p32x32[i][j] += counts->tx.p32x32[i][j];
 
     for (j = 0; j < TX_SIZES - 1; j++)
-      cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+      accum->tx.p16x16[i][j] += counts->tx.p16x16[i][j];
 
     for (j = 0; j < TX_SIZES - 2; j++)
-      cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+      accum->tx.p8x8[i][j] += counts->tx.p8x8[i][j];
   }
 
   for (i = 0; i < TX_SIZES; i++)
-    cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
+    accum->tx.tx_totals[i] += counts->tx.tx_totals[i];
 
   for (i = 0; i < SKIP_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      cm->counts.skip[i][j] += counts->skip[i][j];
+      accum->skip[i][j] += counts->skip[i][j];
 
   for (i = 0; i < MV_JOINTS; i++)
-    cm->counts.mv.joints[i] += counts->mv.joints[i];
+    accum->mv.joints[i] += counts->mv.joints[i];
 
   for (k = 0; k < 2; k++) {
-    nmv_component_counts *comps = &cm->counts.mv.comps[k];
-    nmv_component_counts *comps_t = &counts->mv.comps[k];
+    nmv_component_counts *const comps = &accum->mv.comps[k];
+    const nmv_component_counts *const comps_t = &counts->mv.comps[k];
 
     for (i = 0; i < 2; i++) {
       comps->sign[i] += comps_t->sign[i];
diff --git a/libvpx/vp9/common/vp9_thread_common.h b/libvpx/vp9/common/vp9_thread_common.h
index 07af1bc4..b3b60c25 100644
--- a/libvpx/vp9/common/vp9_thread_common.h
+++ b/libvpx/vp9/common/vp9_thread_common.h
@@ -8,12 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
-#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_
+#define VP9_COMMON_VP9_THREAD_COMMON_H_
 #include "./vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx_util/vpx_thread.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9Common;
 struct FRAME_COUNTS;
 
@@ -51,7 +55,11 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                               VPxWorker *workers, int num_workers,
                               VP9LfSync *lf_sync);
 
-void vp9_accumulate_frame_counts(struct VP9Common *cm,
-                                 struct FRAME_COUNTS *counts, int is_dec);
+void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
+                                 const struct FRAME_COUNTS *counts, int is_dec);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
-#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#endif  // VP9_COMMON_VP9_THREAD_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c
index 7a20e0a9..9fcb97c8 100644
--- a/libvpx/vp9/common/vp9_tile_common.c
+++ b/libvpx/vp9/common/vp9_tile_common.c
@@ -9,8 +9,8 @@
  */
 
 #include "vp9/common/vp9_tile_common.h"
-
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 #define MIN_TILE_WIDTH_B64 4
 #define MAX_TILE_WIDTH_B64 64
@@ -18,7 +18,7 @@
 static int get_tile_offset(int idx, int mis, int log2) {
   const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
   const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2;
-  return MIN(offset, mis);
+  return VPXMIN(offset, mis);
 }
 
 void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) {
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index 4a163457..8d312d03 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -12,14 +12,14 @@
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
-void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[2];
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = _mm_loadu_si128((const __m128i *)(input));
-  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -77,21 +77,21 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   }
 }
 
-void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[8];
   const __m128i zero = _mm_setzero_si128();
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
 
   // load input data
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8 * 1);
+  in[2] = load_input_data(input + 8 * 2);
+  in[3] = load_input_data(input + 8 * 3);
+  in[4] = load_input_data(input + 8 * 4);
+  in[5] = load_input_data(input + 8 * 5);
+  in[6] = load_input_data(input + 8 * 6);
+  in[7] = load_input_data(input + 8 * 7);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -144,8 +144,8 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   RECON_AND_STORE(dest + 7 * stride, in[7]);
 }
 
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
   __m128i in0[16], in1[16];
 
   load_buffer_8x16(input, in0);
diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c
index fb7b3b80..f1916639 100644
--- a/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -17,6 +17,7 @@
 
 #include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
@@ -658,7 +659,7 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
     // pixels of each superblock row can be changed by next superblock row.
     if (pbi->frame_parallel_decode)
       vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                           MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+                           VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
 
     // Skip border extension if block is inside the frame.
     if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
@@ -686,7 +687,7 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
      if (pbi->frame_parallel_decode) {
        const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
        vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                            MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+                            VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
      }
   }
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -757,8 +758,8 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
 static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                          int n4_wl, int n4_hl) {
   // get minimum log2 num4x4s dimension
-  const int x = MIN(n4_wl, n4_hl);
-  return MIN(mbmi->tx_size,  x);
+  const int x = VPXMIN(n4_wl, n4_hl);
+  return VPXMIN(mbmi->tx_size,  x);
 }
 
 static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
@@ -819,8 +820,8 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
   const int less8x8 = bsize < BLOCK_8X8;
   const int bw = 1 << (bwl - 1);
   const int bh = 1 << (bhl - 1);
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
 
   MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
                                    bw, bh, x_mis, y_mis, bwl, bhl);
@@ -895,6 +896,10 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
   }
 
   xd->corrupted |= vpx_reader_has_error(r);
+
+  if (cm->lf.filter_level) {
+    vp9_build_mask(cm, mbmi, mi_row, mi_col, bw, bh);
+  }
 }
 
 static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
@@ -1180,11 +1185,11 @@ static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
                              : literal_to_filter[vpx_rb_read_literal(rb, 2)];
 }
 
-static void setup_display_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
-  cm->display_width = cm->width;
-  cm->display_height = cm->height;
+static void setup_render_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  cm->render_width = cm->width;
+  cm->render_height = cm->height;
   if (vpx_rb_read_bit(rb))
-    vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
+    vp9_read_frame_size(rb, &cm->render_width, &cm->render_height);
 }
 
 static void resize_mv_buffer(VP9_COMMON *cm) {
@@ -1232,7 +1237,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   BufferPool *const pool = cm->buffer_pool;
   vp9_read_frame_size(rb, &width, &height);
   resize_context_buffers(cm, width, height);
-  setup_display_size(cm, rb);
+  setup_render_size(cm, rb);
 
   lock_buffer_pool(pool);
   if (vpx_realloc_frame_buffer(
@@ -1255,6 +1260,9 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
   pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_width  = cm->render_width;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
 static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
@@ -1313,7 +1321,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   }
 
   resize_context_buffers(cm, width, height);
-  setup_display_size(cm, rb);
+  setup_render_size(cm, rb);
 
   lock_buffer_pool(pool);
   if (vpx_realloc_frame_buffer(
@@ -1336,6 +1344,9 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
   pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_width  = cm->render_width;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
 static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
@@ -1358,12 +1369,6 @@ static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
     cm->log2_tile_rows += vpx_rb_read_bit(rb);
 }
 
-typedef struct TileBuffer {
-  const uint8_t *data;
-  size_t size;
-  int col;  // only used with multi-threaded decoding
-} TileBuffer;
-
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'.
 static void get_tile_buffer(const uint8_t *const data_end,
@@ -1461,6 +1466,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * aligned_cols);
 
+  vp9_reset_lfm(cm);
+
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
   if (pbi->tile_data == NULL ||
@@ -1560,30 +1567,54 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
   return vpx_reader_find_end(&tile_data->bit_reader);
 }
 
+// On entry, 'tile_data->data_end' points to the end of the input frame. On
+// exit, it holds the bitreader end position of the final tile column if that
+// column is in this worker's tile buffer group, or NULL otherwise.
 static int tile_worker_hook(TileWorkerData *const tile_data,
-                            const TileInfo *const tile) {
-  int mi_row, mi_col;
+                            VP9Decoder *const pbi) {
+  TileInfo *volatile tile = &tile_data->xd.tile;
+  const int final_col = (1 << pbi->common.log2_tile_cols) - 1;
+  const uint8_t *volatile bit_reader_end = NULL;
+  volatile int n = tile_data->buf_start;
+  tile_data->error_info.setjmp = 1;
 
   if (setjmp(tile_data->error_info.jmp)) {
     tile_data->error_info.setjmp = 0;
     tile_data->xd.corrupted = 1;
+    tile_data->data_end = NULL;
     return 0;
   }
 
-  tile_data->error_info.setjmp = 1;
   tile_data->xd.error_info = &tile_data->error_info;
+  tile_data->xd.corrupted = 0;
 
-  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
-    vp9_zero(tile_data->xd.left_context);
-    vp9_zero(tile_data->xd.left_seg_context);
-    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE) {
-      decode_partition(tile_data->pbi, &tile_data->xd,
-                       mi_row, mi_col, &tile_data->bit_reader,
-                       BLOCK_64X64, 4);
+  do {
+    int mi_row, mi_col;
+    const TileBuffer *const buf = pbi->tile_buffers + n;
+    vp9_zero(tile_data->dqcoeff);
+    vp9_tile_init(tile, &pbi->common, 0, buf->col);
+    setup_token_decoder(buf->data, tile_data->data_end, buf->size,
+                        &tile_data->error_info, &tile_data->bit_reader,
+                        pbi->decrypt_cb, pbi->decrypt_state);
+    vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff);
+
+    for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+         mi_row += MI_BLOCK_SIZE) {
+      vp9_zero(tile_data->xd.left_context);
+      vp9_zero(tile_data->xd.left_seg_context);
+      for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+           mi_col += MI_BLOCK_SIZE) {
+        decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
+                         &tile_data->bit_reader, BLOCK_64X64, 4);
+      }
     }
-  }
+
+    if (buf->col == final_col) {
+      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
+    }
+  } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end);
+
+  tile_data->data_end = bit_reader_end;
   return !tile_data->xd.corrupted;
 }
 
@@ -1603,20 +1634,15 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
-  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
-  TileBuffer tile_buffers[1][1 << 6];
+  const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
   int n;
-  int final_worker = -1;
 
   assert(tile_cols <= (1 << 6));
   assert(tile_rows == 1);
   (void)tile_rows;
 
-  // TODO(jzern): See if we can remove the restriction of passing in max
-  // threads to the decoder.
   if (pbi->num_tile_workers == 0) {
-    const int num_threads = pbi->max_threads & ~1;
-    int i;
+    const int num_threads = pbi->max_threads;
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
     // Ensure tile data offsets will be properly aligned. This may fail on
@@ -1625,14 +1651,12 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
     CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
                     vpx_memalign(32, num_threads *
                                  sizeof(*pbi->tile_worker_data)));
-    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
-                    vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
-    for (i = 0; i < num_threads; ++i) {
-      VPxWorker *const worker = &pbi->tile_workers[i];
+    for (n = 0; n < num_threads; ++n) {
+      VPxWorker *const worker = &pbi->tile_workers[n];
       ++pbi->num_tile_workers;
 
       winterface->init(worker);
-      if (i < num_threads - 1 && !winterface->reset(worker)) {
+      if (n < num_threads - 1 && !winterface->reset(worker)) {
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                            "Tile decoder thread creation failed");
       }
@@ -1642,10 +1666,14 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
     VPxWorker *const worker = &pbi->tile_workers[n];
+    TileWorkerData *const tile_data = &pbi->tile_worker_data[n];
     winterface->sync(worker);
+    tile_data->xd = pbi->mb;
+    tile_data->xd.counts =
+        cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
     worker->hook = (VPxWorkerHook)tile_worker_hook;
-    worker->data1 = &pbi->tile_worker_data[n];
-    worker->data2 = &pbi->tile_worker_info[n];
+    worker->data1 = tile_data;
+    worker->data2 = pbi;
   }
 
   // Note: this memset assumes above_context[0], [1] and [2]
@@ -1655,101 +1683,95 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * aligned_mi_cols);
 
+  vp9_reset_lfm(cm);
+
   // Load tile data into tile_buffers
-  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
+                   &pbi->tile_buffers);
 
   // Sort the buffers based on size in descending order.
-  qsort(tile_buffers[0], tile_cols, sizeof(tile_buffers[0][0]),
+  qsort(pbi->tile_buffers, tile_cols, sizeof(pbi->tile_buffers[0]),
         compare_tile_buffers);
 
-  // Rearrange the tile buffers such that per-tile group the largest, and
-  // presumably the most difficult, tile will be decoded in the main thread.
-  // This should help minimize the number of instances where the main thread is
-  // waiting for a worker to complete.
-  {
-    int group_start = 0;
-    while (group_start < tile_cols) {
-      const TileBuffer largest = tile_buffers[0][group_start];
-      const int group_end = MIN(group_start + num_workers, tile_cols) - 1;
-      memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1,
-              (group_end - group_start) * sizeof(tile_buffers[0][0]));
-      tile_buffers[0][group_end] = largest;
-      group_start = group_end + 1;
+  if (num_workers == tile_cols) {
+    // Rearrange the tile buffers such that the largest, and
+    // presumably the most difficult, tile will be decoded in the main thread.
+    // This should help minimize the number of instances where the main thread
+    // is waiting for a worker to complete.
+    const TileBuffer largest = pbi->tile_buffers[0];
+    memmove(pbi->tile_buffers, pbi->tile_buffers + 1,
+            (tile_cols - 1) * sizeof(pbi->tile_buffers[0]));
+    pbi->tile_buffers[tile_cols - 1] = largest;
+  } else {
+    int start = 0, end = tile_cols - 2;
+    TileBuffer tmp;
+
+    // Interleave the tiles to distribute the load between threads, assuming a
+    // larger tile implies it is more difficult to decode.
+    while (start < end) {
+      tmp = pbi->tile_buffers[start];
+      pbi->tile_buffers[start] = pbi->tile_buffers[end];
+      pbi->tile_buffers[end] = tmp;
+      start += 2;
+      end -= 2;
     }
   }
 
   // Initialize thread frame counts.
   if (!cm->frame_parallel_decoding_mode) {
-    int i;
-
-    for (i = 0; i < num_workers; ++i) {
+    for (n = 0; n < num_workers; ++n) {
       TileWorkerData *const tile_data =
-          (TileWorkerData*)pbi->tile_workers[i].data1;
+          (TileWorkerData*)pbi->tile_workers[n].data1;
       vp9_zero(tile_data->counts);
     }
   }
 
-  n = 0;
-  while (n < tile_cols) {
-    int i;
-    for (i = 0; i < num_workers && n < tile_cols; ++i) {
-      VPxWorker *const worker = &pbi->tile_workers[i];
+  {
+    const int base = tile_cols / num_workers;
+    const int remain = tile_cols % num_workers;
+    int buf_start = 0;
+
+    for (n = 0; n < num_workers; ++n) {
+      const int count = base + (remain + n) / num_workers;
+      VPxWorker *const worker = &pbi->tile_workers[n];
       TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
-      TileInfo *const tile = (TileInfo*)worker->data2;
-      TileBuffer *const buf = &tile_buffers[0][n];
 
-      tile_data->pbi = pbi;
-      tile_data->xd = pbi->mb;
-      tile_data->xd.corrupted = 0;
-      tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
-                             0 : &tile_data->counts;
-      vp9_zero(tile_data->dqcoeff);
-      vp9_tile_init(tile, cm, 0, buf->col);
-      vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
-      setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
-                          &tile_data->bit_reader, pbi->decrypt_cb,
-                          pbi->decrypt_state);
-      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+      tile_data->buf_start = buf_start;
+      tile_data->buf_end = buf_start + count - 1;
+      tile_data->data_end = data_end;
+      buf_start += count;
 
       worker->had_error = 0;
-      if (i == num_workers - 1 || n == tile_cols - 1) {
+      if (n == num_workers - 1) {
+        assert(tile_data->buf_end == tile_cols - 1);
         winterface->execute(worker);
       } else {
         winterface->launch(worker);
       }
-
-      if (buf->col == tile_cols - 1) {
-        final_worker = i;
-      }
-
-      ++n;
     }
 
-    for (; i > 0; --i) {
-      VPxWorker *const worker = &pbi->tile_workers[i - 1];
+    for (; n > 0; --n) {
+      VPxWorker *const worker = &pbi->tile_workers[n - 1];
+      TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
       // TODO(jzern): The tile may have specific error data associated with
       // its vpx_internal_error_info which could be propagated to the main info
       // in cm. Additionally once the threads have been synced and an error is
       // detected, there's no point in continuing to decode tiles.
       pbi->mb.corrupted |= !winterface->sync(worker);
+      if (!bit_reader_end) bit_reader_end = tile_data->data_end;
     }
-    if (final_worker > -1) {
-      TileWorkerData *const tile_data =
-          (TileWorkerData*)pbi->tile_workers[final_worker].data1;
-      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
-      final_worker = -1;
-    }
+  }
 
-    // Accumulate thread frame counts.
-    if (n >= tile_cols && !cm->frame_parallel_decoding_mode) {
-      for (i = 0; i < num_workers; ++i) {
-        TileWorkerData *const tile_data =
-            (TileWorkerData*)pbi->tile_workers[i].data1;
-        vp9_accumulate_frame_counts(cm, &tile_data->counts, 1);
-      }
+  // Accumulate thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (n = 0; n < num_workers; ++n) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[n].data1;
+      vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1);
     }
   }
 
+  assert(bit_reader_end || pbi->mb.corrupted);
   return bit_reader_end;
 }
 
@@ -1773,7 +1795,7 @@ static void read_bitdepth_colorspace_sampling(
   }
   cm->color_space = vpx_rb_read_literal(rb, 3);
   if (cm->color_space != VPX_CS_SRGB) {
-    vpx_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
+    cm->color_range = (vpx_color_range_t)vpx_rb_read_bit(rb);
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       cm->subsampling_x = vpx_rb_read_bit(rb);
       cm->subsampling_y = vpx_rb_read_bit(rb);
@@ -1787,6 +1809,7 @@ static void read_bitdepth_colorspace_sampling(
       cm->subsampling_y = cm->subsampling_x = 1;
     }
   } else {
+    cm->color_range = VPX_CR_FULL_RANGE;
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
       // 4:2:2 or 4:4:0 chroma sampling is not allowed.
@@ -1892,6 +1915,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
         // specifies that the default color format should be YUV 4:2:0 in this
         // case (normative).
         cm->color_space = VPX_CS_BT_601;
+        cm->color_range = VPX_CR_STUDIO_RANGE;
         cm->subsampling_y = cm->subsampling_x = 1;
         cm->bit_depth = VPX_BITS_8;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1942,6 +1966,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
   get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
 #endif
   get_frame_new_buffer(cm)->color_space = cm->color_space;
+  get_frame_new_buffer(cm)->color_range = cm->color_range;
+  get_frame_new_buffer(cm)->render_width  = cm->render_width;
+  get_frame_new_buffer(cm)->render_height = cm->render_height;
 
   if (pbi->need_resync) {
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -2102,7 +2129,7 @@ static struct vpx_read_bit_buffer *init_read_bit_buffer(
   rb->error_handler = error_handler;
   rb->error_handler_data = &pbi->common;
   if (pbi->decrypt_cb) {
-    const int n = (int)MIN(MAX_VP9_HEADER_SIZE, data_end - data);
+    const int n = (int)VPXMIN(MAX_VP9_HEADER_SIZE, data_end - data);
     pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n);
     rb->bit_buffer = clear_data;
     rb->bit_buffer_end = clear_data + n;
diff --git a/libvpx/vp9/decoder/vp9_decodeframe.h b/libvpx/vp9/decoder/vp9_decodeframe.h
index 05af7063..ce33cbdb 100644
--- a/libvpx/vp9/decoder/vp9_decodeframe.h
+++ b/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -16,6 +16,8 @@
 extern "C" {
 #endif
 
+#include "vp9/common/vp9_enums.h"
+
 struct VP9Decoder;
 struct vpx_read_bit_buffer;
 
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 33818a99..d3ca7b3f 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -22,6 +22,8 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodeframe.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
+
 static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
   return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p);
 }
@@ -87,7 +89,7 @@ static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
   if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
     return read_selected_tx_size(cm, xd, max_tx_size, r);
   else
-    return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
+    return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
 }
 
 static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
@@ -96,8 +98,8 @@ static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
 
   for (y = 0; y < y_mis; y++)
     for (x = 0; x < x_mis; x++)
-      segment_id = MIN(segment_id,
-                       segment_ids[mi_offset + y * cm->mi_cols + x]);
+      segment_id =
+          VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
   return segment_id;
@@ -156,8 +158,8 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   const int bh = xd->plane[0].n4_h >> 1;
 
   // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
+  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
@@ -212,8 +214,8 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
   const int bh = xd->plane[0].n4_h >> 1;
 
   // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
+  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
 
   mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c
index 6734d002..4e88819b 100644
--- a/libvpx/vp9/decoder/vp9_decoder.c
+++ b/libvpx/vp9/decoder/vp9_decoder.c
@@ -126,6 +126,9 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
 void vp9_decoder_remove(VP9Decoder *pbi) {
   int i;
 
+  if (!pbi)
+    return;
+
   vpx_get_worker_interface()->end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
   vpx_free(pbi->tile_data);
@@ -134,7 +137,6 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
     vpx_get_worker_interface()->end(worker);
   }
   vpx_free(pbi->tile_worker_data);
-  vpx_free(pbi->tile_worker_info);
   vpx_free(pbi->tile_workers);
 
   if (pbi->num_tile_workers > 0) {
diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h
index 915f9dc8..4a5188f8 100644
--- a/libvpx/vp9/decoder/vp9_decoder.h
+++ b/libvpx/vp9/decoder/vp9_decoder.h
@@ -36,8 +36,15 @@ typedef struct TileData {
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
 } TileData;
 
+typedef struct TileBuffer {
+  const uint8_t *data;
+  size_t size;
+  int col;  // only used with multi-threaded decoding
+} TileBuffer;
+
 typedef struct TileWorkerData {
-  struct VP9Decoder *pbi;
+  const uint8_t *data_end;
+  int buf_start, buf_end;  // pbi->tile_buffers to decode, inclusive
   vpx_reader bit_reader;
   FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
@@ -65,7 +72,7 @@ typedef struct VP9Decoder {
   VPxWorker lf_worker;
   VPxWorker *tile_workers;
   TileWorkerData *tile_worker_data;
-  TileInfo *tile_worker_info;
+  TileBuffer tile_buffers[64];
   int num_tile_workers;
 
   TileData *tile_data;
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index e4412dc3..59123653 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -259,7 +259,7 @@ int vp9_decode_block_tokens(MACROBLOCKD *xd,
   const int16_t *const dequant = pd->seg_dequant[seg_id];
   const int ctx = get_entropy_context(tx_size, pd->above_context + x,
                                                pd->left_context + y);
-  const int eob = decode_coefs(xd, pd->plane_type,
+  const int eob = decode_coefs(xd, get_plane_type(plane),
                                pd->dqcoeff, tx_size,
                                dequant, ctx, sc->scan, sc->neighbors, r);
   dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
diff --git a/libvpx/vp9/decoder/vp9_dthread.h b/libvpx/vp9/decoder/vp9_dthread.h
index f6cdccd9..ba7c38a5 100644
--- a/libvpx/vp9/decoder/vp9_dthread.h
+++ b/libvpx/vp9/decoder/vp9_dthread.h
@@ -15,6 +15,10 @@
 #include "vpx_util/vpx_thread.h"
 #include "vpx/internal/vpx_codec_internal.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9Common;
 struct VP9Decoder;
 
@@ -63,4 +67,8 @@ void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
 void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
                                   VPxWorker *const src_worker);
 
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
 #endif  // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.c b/libvpx/vp9/encoder/vp9_aq_complexity.c
index 15f227fb..30ec1911 100644
--- a/libvpx/vp9/encoder/vp9_aq_complexity.c
+++ b/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -10,6 +10,7 @@
 
 #include <limits.h>
 #include <math.h>
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/system_state.h"
 
 #include "vp9/encoder/vp9_aq_complexity.h"
@@ -117,8 +118,8 @@ void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
   const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
-  const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
-  const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
   int x, y;
   int i;
   unsigned char segment;
@@ -136,7 +137,7 @@ void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
 
     vpx_clear_system_state();
     low_var_thresh = (cpi->oxcf.pass == 2)
-      ? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+      ? VPXMAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
       : DEFAULT_LV_THRESH;
 
     vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index e6b36861..2cd89c0d 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -11,6 +11,7 @@
 #include <limits.h>
 #include <math.h>
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/system_state.h"
 
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
@@ -20,46 +21,9 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
 
-struct CYCLIC_REFRESH {
-  // Percentage of blocks per frame that are targeted as candidates
-  // for cyclic refresh.
-  int percent_refresh;
-  // Maximum q-delta as percentage of base q.
-  int max_qdelta_perc;
-  // Superblock starting index for cycling through the frame.
-  int sb_index;
-  // Controls how long block will need to wait to be refreshed again, in
-  // excess of the cycle time, i.e., in the case of all zero motion, block
-  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
-  int time_for_refresh;
-  // Target number of (8x8) blocks that are set for delta-q.
-  int target_num_seg_blocks;
-  // Actual number of (8x8) blocks that were applied delta-q.
-  int actual_num_seg1_blocks;
-  int actual_num_seg2_blocks;
-  // RD mult. parameters for segment 1.
-  int rdmult;
-  // Cyclic refresh map.
-  signed char *map;
-  // Map of the last q a block was coded at.
-  uint8_t *last_coded_q_map;
-  // Thresholds applied to the projected rate/distortion of the coding block,
-  // when deciding whether block should be refreshed.
-  int64_t thresh_rate_sb;
-  int64_t thresh_dist_sb;
-  // Threshold applied to the motion vector (in units of 1/8 pel) of the
-  // coding block, when deciding whether block should be refreshed.
-  int16_t motion_thresh;
-  // Rate target ratio to set q delta.
-  double rate_ratio_qdelta;
-  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
-  int rate_boost_fac;
-  double low_content_avg;
-  int qindex_delta[3];
-};
-
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
+  size_t consec_zero_mv_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
   if (cr == NULL)
     return NULL;
@@ -78,12 +42,20 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
 
+  consec_zero_mv_size = mi_rows * mi_cols * sizeof(*cr->consec_zero_mv);
+  cr->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
+  if (cr->consec_zero_mv == NULL) {
+    vpx_free(cr);
+    return NULL;
+  }
+  memset(cr->consec_zero_mv, 0, consec_zero_mv_size);
   return cr;
 }
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   vpx_free(cr->map);
   vpx_free(cr->last_coded_q_map);
+  vpx_free(cr->consec_zero_mv);
   vpx_free(cr);
 }
 
@@ -195,7 +167,8 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
   int num8x8bl = cm->MBs << 2;
   // Weight for segment prior to encoding: take the average of the target
   // number for the frame to be encoded and the actual from the previous frame.
-  double weight_segment = (double)((cr->target_num_seg_blocks +
+  int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  double weight_segment = (double)((target_refresh +
       cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) /
       num8x8bl;
   // Compute delta-q corresponding to qindex i.
@@ -223,8 +196,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
   const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
                                                       bsize);
@@ -236,7 +209,7 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
   // segment_id.
   if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
     mbmi->segment_id = refresh_this_block;
-    // Reset segment_id if will be skipped.
+    // Reset segment_id if it will be skipped.
     if (skip)
       mbmi->segment_id = CR_SEGMENT_ID_BASE;
   }
@@ -265,14 +238,48 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
       int map_offset = block_index + y * cm->mi_cols + x;
       cr->map[map_offset] = new_map_value;
       cpi->segmentation_map[map_offset] = mbmi->segment_id;
+    }
+}
+
+void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
+                                             const MB_MODE_INFO *const mbmi,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  MV mv = mbmi->mv[0].as_mv;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  int x, y;
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_cols + x;
       // Inter skip blocks were clearly not coded at the current qindex, so
       // don't update the map for them. For cases where motion is non-zero or
       // the reference frame isn't the previous frame, the previous value in
       // the map for this spatial location is not entirely correct.
-      if (!is_inter_block(mbmi) || !skip)
+      if ((!is_inter_block(mbmi) || !mbmi->skip) &&
+          mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
         cr->last_coded_q_map[map_offset] = clamp(
             cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+      } else if (is_inter_block(mbmi) && mbmi->skip &&
+                 mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        cr->last_coded_q_map[map_offset] = VPXMIN(
+            clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+                  0, MAXQ),
+            cr->last_coded_q_map[map_offset]);
+      // Update the consecutive zero/low_mv count.
+      if (is_inter_block(mbmi) && (abs(mv.row) < 8 && abs(mv.col) < 8)) {
+        if (cr->consec_zero_mv[map_offset] < 255)
+          cr->consec_zero_mv[map_offset]++;
+      } else {
+        cr->consec_zero_mv[map_offset] = 0;
+      }
     }
+  }
 }
 
 // Update the actual number of blocks that were applied the segment delta q.
@@ -389,6 +396,10 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
   unsigned char *const seg_map = cpi->segmentation_map;
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
+  int consec_zero_mv_thresh = 0;
+  int qindex_thresh = 0;
+  int count_sel = 0;
+  int count_tot = 0;
   memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
   sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
   sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
@@ -401,6 +412,12 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
   assert(cr->sb_index < sbs_in_frame);
   i = cr->sb_index;
   cr->target_num_seg_blocks = 0;
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+    consec_zero_mv_thresh = 100;
+  qindex_thresh =
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN
+      ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+      : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
   do {
     int sum_map = 0;
     // Get the mi_row/mi_col corresponding to superblock index i.
@@ -408,18 +425,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * MI_BLOCK_SIZE;
     int mi_col = sb_col_index * MI_BLOCK_SIZE;
-    int qindex_thresh =
-        cpi->oxcf.content == VP9E_CONTENT_SCREEN
-            ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
-            : 0;
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
     bl_index = mi_row * cm->mi_cols + mi_col;
     // Loop through all 8x8 blocks in superblock and update map.
-    xmis = MIN(cm->mi_cols - mi_col,
-               num_8x8_blocks_wide_lookup[BLOCK_64X64]);
-    ymis = MIN(cm->mi_rows - mi_row,
-               num_8x8_blocks_high_lookup[BLOCK_64X64]);
+    xmis =
+        VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+    ymis =
+        VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
         const int bl_index2 = bl_index + y * cm->mi_cols + x;
@@ -427,8 +440,12 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than ZEROMV.
         if (cr->map[bl_index2] == 0) {
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh)
+          count_tot++;
+          if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
+              cr->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
             sum_map++;
+            count_sel++;
+          }
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -449,6 +466,9 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     }
   } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
   cr->sb_index = i;
+  cr->reduce_refresh = 0;
+  if (count_sel < (3 * count_tot) >> 2)
+    cr->reduce_refresh = 1;
 }
 
 // Set cyclic refresh parameters.
@@ -457,6 +477,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   cr->percent_refresh = 10;
+  if (cr->reduce_refresh)
+    cr->percent_refresh = 5;
   cr->max_qdelta_perc = 50;
   cr->time_for_refresh = 0;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
@@ -476,7 +498,11 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->rate_boost_fac = 10;
   } else {
     cr->motion_thresh = 32;
-    cr->rate_boost_fac = 17;
+    cr->rate_boost_fac = 15;
+  }
+  if (cpi->svc.spatial_layer_id > 0) {
+    cr->motion_thresh = 4;
+    cr->rate_boost_fac = 12;
   }
 }
 
@@ -489,11 +515,10 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
   const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
   if (cm->current_video_frame == 0)
     cr->low_content_avg = 0.0;
-  // Don't apply refresh on key frame or enhancement layer frames.
+  // Don't apply refresh on key frame or temporal enhancement layer frames.
   if (!apply_cyclic_refresh ||
       (cm->frame_type == KEY_FRAME) ||
-      (cpi->svc.temporal_layer_id > 0) ||
-      (cpi->svc.spatial_layer_id > 0)) {
+      (cpi->svc.temporal_layer_id > 0)) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->segmentation_map;
     memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
@@ -501,6 +526,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     if (cm->frame_type == KEY_FRAME) {
       memset(cr->last_coded_q_map, MAXQ,
              cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+      memset(cr->consec_zero_mv, 0,
+             cm->mi_rows * cm->mi_cols * sizeof(*cr->consec_zero_mv));
       cr->sb_index = 0;
     }
     return;
@@ -551,11 +578,16 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
 
     // Set a more aggressive (higher) q delta for segment BOOST2.
     qindex_delta = compute_deltaq(
-        cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO,
-        0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+        cpi, cm->base_qindex,
+        VPXMIN(CR_MAX_RATE_TARGET_RATIO,
+               0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
     cr->qindex_delta[2] = qindex_delta;
     vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
 
+    // Reset if resolution change has occurred.
+    if (cpi->resize_pending != 0)
+      vp9_cyclic_refresh_reset_resize(cpi);
+
     // Update the segmentation and refresh map.
     cyclic_refresh_update_map(cpi);
   }
@@ -569,6 +601,8 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+  memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
+  memset(cr->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols);
   cr->sb_index = 0;
   cpi->refresh_golden_frame = 1;
 }
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
index 29d2a91b..a5b38138 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -12,6 +12,7 @@
 #ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
 #define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
 
+#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
 #ifdef __cplusplus
@@ -27,9 +28,49 @@ extern "C" {
 // Maximum rate target ratio for setting segment delta-qp.
 #define CR_MAX_RATE_TARGET_RATIO 4.0
 
+struct CYCLIC_REFRESH {
+  // Percentage of blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int percent_refresh;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long block will need to wait to be refreshed again, in
+  // excess of the cycle time, i.e., in the case of all zero motion, block
+  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+  int time_for_refresh;
+  // Target number of (8x8) blocks that are set for delta-q.
+  int target_num_seg_blocks;
+  // Actual number of (8x8) blocks that were applied delta-q.
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Cyclic refresh map.
+  signed char *map;
+  // Map of the last q a block was coded at.
+  uint8_t *last_coded_q_map;
+  // Count on how many consecutive times a block uses ZEROMV for encoding.
+  uint8_t *consec_zero_mv;
+  // Thresholds applied to the projected rate/distortion of the coding block,
+  // when deciding whether block should be refreshed.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+  // Threshold applied to the motion vector (in units of 1/8 pel) of the
+  // coding block, when deciding whether block should be refreshed.
+  int16_t motion_thresh;
+  // Rate target ratio to set q delta.
+  double rate_ratio_qdelta;
+  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  int rate_boost_fac;
+  double low_content_avg;
+  int qindex_delta[3];
+  int reduce_refresh;
+};
+
 struct VP9_COMP;
 
-struct CYCLIC_REFRESH;
 typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
 
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
@@ -54,6 +95,11 @@ void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int64_t rate, int64_t dist, int skip);
 
+void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
+                                             const MB_MODE_INFO *const mbmi,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize);
+
 // Update the segmentation map, and related quantities: cyclic refresh map,
 // refresh sb_index, and target number of blocks to be refreshed.
 void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi);
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index d0de095a..46155543 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -14,6 +14,7 @@
 
 #include "vpx/vpx_encoder.h"
 #include "vpx_dsp/bitwriter_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/system_state.h"
@@ -175,12 +176,10 @@ static void pack_mb_tokens(vpx_writer *w,
         const unsigned char *pb = b->prob;
         int v = e >> 1;
         int n = l;              /* number of bits in v, assumed nonzero */
-        int i = 0;
 
         do {
           const int bb = (v >> --n) & 1;
-          vpx_write(w, bb, pb[i >> 1]);
-          i = b->tree[i + bb];
+          vpx_write(w, bb, *pb++);
         } while (n);
       }
 
@@ -815,7 +814,7 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
 static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
   // Mode
-  vpx_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
+  vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
     vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
 
@@ -968,14 +967,14 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   return total_size;
 }
 
-static void write_display_size(const VP9_COMMON *cm,
-                               struct vpx_write_bit_buffer *wb) {
-  const int scaling_active = cm->width != cm->display_width ||
-                             cm->height != cm->display_height;
+static void write_render_size(const VP9_COMMON *cm,
+                              struct vpx_write_bit_buffer *wb) {
+  const int scaling_active = cm->width != cm->render_width ||
+                             cm->height != cm->render_height;
   vpx_wb_write_bit(wb, scaling_active);
   if (scaling_active) {
-    vpx_wb_write_literal(wb, cm->display_width - 1, 16);
-    vpx_wb_write_literal(wb, cm->display_height - 1, 16);
+    vpx_wb_write_literal(wb, cm->render_width - 1, 16);
+    vpx_wb_write_literal(wb, cm->render_height - 1, 16);
   }
 }
 
@@ -984,7 +983,7 @@ static void write_frame_size(const VP9_COMMON *cm,
   vpx_wb_write_literal(wb, cm->width - 1, 16);
   vpx_wb_write_literal(wb, cm->height - 1, 16);
 
-  write_display_size(cm, wb);
+  write_render_size(cm, wb);
 }
 
 static void write_frame_size_with_refs(VP9_COMP *cpi,
@@ -1022,7 +1021,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
     vpx_wb_write_literal(wb, cm->height - 1, 16);
   }
 
-  write_display_size(cm, wb);
+  write_render_size(cm, wb);
 }
 
 static void write_sync_code(struct vpx_write_bit_buffer *wb) {
@@ -1059,7 +1058,8 @@ static void write_bitdepth_colorspace_sampling(
   }
   vpx_wb_write_literal(wb, cm->color_space, 3);
   if (cm->color_space != VPX_CS_SRGB) {
-    vpx_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    vpx_wb_write_bit(wb, cm->color_range);
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
       vpx_wb_write_bit(wb, cm->subsampling_x);
diff --git a/libvpx/vp9/encoder/vp9_context_tree.c b/libvpx/vp9/encoder/vp9_context_tree.c
index e87cccba..396ed3fe 100644
--- a/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/libvpx/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h
index ac244977..8e365ce3 100644
--- a/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/libvpx/vp9/encoder/vp9_context_tree.h
@@ -14,6 +14,10 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/encoder/vp9_block.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9_COMP;
 struct VP9Common;
 struct ThreadData;
@@ -84,4 +88,8 @@ typedef struct PC_TREE {
 void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
 void vp9_free_pc_tree(struct ThreadData *td);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c
index 5f992856..8623b422 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/libvpx/vp9/encoder/vp9_denoiser.c
@@ -10,19 +10,18 @@
 
 #include <assert.h>
 #include <limits.h>
+#include <math.h>
+
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_denoiser.h"
+#include "vp9/encoder/vp9_encoder.h"
 
-/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
- * with speed 6, though it (inexplicably) seems to also work with speed 5 (one
- * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to
- * make the calls to the vp9_denoiser_* functions when in speed 5).
- *
- * The implementation is very similar to that of the VP8 denoiser. While
+/* The VP9 denoiser is similar to that of the VP8 denoiser. While
  * choosing the motion vectors / reference frames, the denoiser is run, and if
  * it did not modify the signal to much, the denoised block is copied to the
  * signal.
@@ -120,10 +119,10 @@ int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
             adj = adj_val[2];
         }
         if (diff > 0) {
-          avg[c] = MIN(UINT8_MAX, sig[c] + adj);
+          avg[c] = VPXMIN(UINT8_MAX, sig[c] + adj);
           total_adj += adj;
         } else {
-          avg[c] = MAX(0, sig[c] - adj);
+          avg[c] = VPXMAX(0, sig[c] - adj);
           total_adj -= adj;
         }
       }
@@ -160,13 +159,13 @@ int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
         // Diff positive means we made positive adjustment above
         // (in first try/attempt), so now make negative adjustment to bring
         // denoised signal down.
-        avg[c] = MAX(0, avg[c] - adj);
+        avg[c] = VPXMAX(0, avg[c] - adj);
         total_adj -= adj;
       } else {
         // Diff negative means we made negative adjustment above
         // (in first try/attempt), so now make positive adjustment to bring
         // denoised signal up.
-        avg[c] = MIN(UINT8_MAX, avg[c] + adj);
+        avg[c] = VPXMIN(UINT8_MAX, avg[c] + adj);
         total_adj += adj;
       }
     }
@@ -194,8 +193,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                                                          int mi_row,
                                                          int mi_col,
                                                          PICK_MODE_CONTEXT *ctx,
-                                                         int *motion_magnitude
-                                                         ) {
+                                                         int *motion_magnitude,
+                                                         int is_skin) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
@@ -213,6 +212,9 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
 
   saved_mbmi = *mbmi;
 
+  if (is_skin && *motion_magnitude > 16)
+    return COPY_BLOCK;
+
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
@@ -312,18 +314,38 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
   int motion_magnitude = 0;
-  VP9_DENOISER_DECISION decision = FILTER_BLOCK;
+  VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
   uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
   uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
                                           mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
+  int is_skin = 0;
+
+  if (bs <= BLOCK_16X16 && denoiser->denoising_on) {
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const int stride = mb->plane[0].src.stride;
+    const int strideuv = mb->plane[1].src.stride;
+    const uint8_t ysource =
+      mb->plane[0].src.buf[y_height_shift * stride + y_width_shift];
+    const uint8_t usource =
+      mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource =
+      mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+  }
 
-  decision = perform_motion_compensation(denoiser, mb, bs,
-                                         denoiser->increase_denoising,
-                                         mi_row, mi_col, ctx,
-                                         &motion_magnitude);
+  if (denoiser->denoising_on)
+    decision = perform_motion_compensation(denoiser, mb, bs,
+                                           denoiser->increase_denoising,
+                                           mi_row, mi_col, ctx,
+                                           &motion_magnitude,
+                                           is_skin);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,
@@ -345,23 +367,24 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
   }
 }
 
-static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
+static void copy_frame(YV12_BUFFER_CONFIG * const dest,
+                       const YV12_BUFFER_CONFIG * const src) {
   int r;
-  const uint8_t *srcbuf = src.y_buffer;
-  uint8_t *destbuf = dest.y_buffer;
+  const uint8_t *srcbuf = src->y_buffer;
+  uint8_t *destbuf = dest->y_buffer;
 
-  assert(dest.y_width == src.y_width);
-  assert(dest.y_height == src.y_height);
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
 
-  for (r = 0; r < dest.y_height; ++r) {
-    memcpy(destbuf, srcbuf, dest.y_width);
-    destbuf += dest.y_stride;
-    srcbuf += src.y_stride;
+  for (r = 0; r < dest->y_height; ++r) {
+    memcpy(destbuf, srcbuf, dest->y_width);
+    destbuf += dest->y_stride;
+    srcbuf += src->y_stride;
   }
 }
 
-static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
-                              YV12_BUFFER_CONFIG *src) {
+static void swap_frame_buffer(YV12_BUFFER_CONFIG * const dest,
+                              YV12_BUFFER_CONFIG * const src) {
   uint8_t *tmp_buf = dest->y_buffer;
   assert(dest->y_width == src->y_width);
   assert(dest->y_height == src->y_height);
@@ -374,27 +397,46 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     FRAME_TYPE frame_type,
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame,
-                                    int refresh_last_frame) {
-  if (frame_type == KEY_FRAME) {
+                                    int refresh_last_frame,
+                                    int resized) {
+  // Copy source into denoised reference buffers on KEY_FRAME or
+  // if the just encoded frame was resized.
+  if (frame_type == KEY_FRAME || resized != 0) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
     for (i = 1; i < MAX_REF_FRAMES; ++i)
-      copy_frame(denoiser->running_avg_y[i], src);
+      copy_frame(&denoiser->running_avg_y[i], &src);
     return;
   }
 
-  /* For non key frames */
-  if (refresh_alt_ref_frame) {
-    swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
-                      &denoiser->running_avg_y[INTRA_FRAME]);
-  }
-  if (refresh_golden_frame) {
-    swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
-                      &denoiser->running_avg_y[INTRA_FRAME]);
-  }
-  if (refresh_last_frame) {
-    swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
-                      &denoiser->running_avg_y[INTRA_FRAME]);
+  // If more than one refresh occurs, must copy frame buffer.
+  if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame)
+      > 1) {
+    if (refresh_alt_ref_frame) {
+      copy_frame(&denoiser->running_avg_y[ALTREF_FRAME],
+                 &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME],
+                 &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      copy_frame(&denoiser->running_avg_y[LAST_FRAME],
+                 &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  } else {
+    if (refresh_alt_ref_frame) {
+      swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
   }
 }
 
@@ -456,15 +498,43 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
     vp9_denoiser_free(denoiser);
     return 1;
   }
+
+  fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height,
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border, legacy_byte_alignment);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
 #ifdef OUTPUT_YUV_DENOISED
   make_grayscale(&denoiser->running_avg_y[i]);
 #endif
   denoiser->increase_denoising = 0;
   denoiser->frame_buffer_initialized = 1;
-
+  vp9_denoiser_init_noise_estimate(denoiser, width, height);
   return 0;
 }
 
+void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
+                                      int width,
+                                      int height) {
+  // Start with denoising disabled and the running noise statistics cleared;
+  // vp9_denoiser_update_noise_estimate() later sets denoising_on from the
+  // measured noise level compared against thresh_noise_estimate.
+  const int num_pixels = width * height;
+  denoiser->denoising_on = 0;
+  denoiser->noise_estimate = 0;
+  denoiser->noise_estimate_count = 0;
+  // Larger frames get a higher threshold: 70 from 1920x1080 pixels up,
+  // 40 from 1280x720 pixels up, 20 otherwise.
+  denoiser->thresh_noise_estimate =
+      (num_pixels >= 1920 * 1080) ? 70 :
+      (num_pixels >= 1280 * 720) ? 40 : 20;
+}
+
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   int i;
   denoiser->frame_buffer_initialized = 0;
@@ -475,6 +545,120 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
     vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
   }
   vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+  vpx_free_frame_buffer(&denoiser->last_source);
+}
+
+// Re-estimates source noise every frame_period frames and, once enough
+// estimates have accumulated, switches cpi->denoiser.denoising_on on or off.
+// While last_source is allocated, the current source is saved into it.
+void vp9_denoiser_update_noise_estimate(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int frame_period = 10;
+  int thresh_consec_zeromv = 8;
+  unsigned int thresh_sum_diff = 128;
+  int num_frames_estimate = 20;
+  int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
+  // Nothing to compare against (or copy into) until last_source is allocated.
+  if (cpi->denoiser.last_source.y_buffer == NULL) return;
+  // Estimate noise every frame_period frames, between current and last source.
+  if (cm->current_video_frame % frame_period != 0) {
+    copy_frame(&cpi->denoiser.last_source, cpi->Source);
+    return;
+  } else {
+    int num_samples = 0;
+    uint64_t avg_est = 0;
+    int bsize = BLOCK_16X16;
+    static const unsigned char const_source[16] = {
+         128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+         128, 128};
+    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+    // been encoded as zero/small mv at least x consecutive frames, compute
+    // the variance to update estimate of noise in the source.
+    const uint8_t *src_y = cpi->Source->y_buffer;
+    const int src_ystride = cpi->Source->y_stride;
+    const uint8_t *last_src_y = cpi->denoiser.last_source.y_buffer;
+    const int last_src_ystride = cpi->denoiser.last_source.y_stride;
+    const uint8_t *src_u = cpi->Source->u_buffer;
+    const uint8_t *src_v = cpi->Source->v_buffer;
+    const int src_uvstride = cpi->Source->uv_stride;
+    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    int mi_row, mi_col;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        // 16x16 blocks, 1/4 sample of frame. Skip the last row/column so the
+        // 8x8 neighbours (bl_index1..3) stay inside the mi_rows x mi_cols grid.
+        if (mi_row % 4 == 0 && mi_col % 4 == 0 &&
+            mi_row < cm->mi_rows - 1 && mi_col < cm->mi_cols - 1) {
+          int bl_index = mi_row * cm->mi_cols + mi_col;
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + cm->mi_cols;
+          int bl_index3 = bl_index2 + 1;
+          // Only consider blocks that are likely steady background. i.e, have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+          const uint8_t ysource =
+            src_y[y_height_shift * src_ystride + y_width_shift];
+          const uint8_t usource =
+            src_u[uv_height_shift * src_uvstride + uv_width_shift];
+          const uint8_t vsource =
+            src_v[uv_height_shift * src_uvstride + uv_width_shift];
+          int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+          if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
+              !is_skin) {
+            // Compute variance.
+            unsigned int sse;
+            unsigned int variance = cpi->fn_ptr[bsize].vf(
+                src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+            // Only consider this block as valid for noise measurement if the
+            // average term (sse - variance = N * avg^{2}, N = 16X16) of the
+            // temporal residual is small (avoid effects from lighting change).
+            if ((sse - variance) < thresh_sum_diff) {
+              unsigned int sse2;
+              const unsigned int spatial_variance =
+                  cpi->fn_ptr[bsize].vf(src_y, src_ystride, const_source,
+                                        0, &sse2);
+              avg_est += variance / (10 + spatial_variance);
+              num_samples++;
+            }
+          }
+        }
+        src_y += 8;
+        last_src_y += 8;
+        src_u += 4;
+        src_v += 4;
+      }
+      src_y += (src_ystride << 3) - (cm->mi_cols << 3);
+      last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
+      src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
+      src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
+    }
+    // Update noise estimate if we have at least a minimum number of block
+    // samples, and avg_est > 0 (avg_est == 0 can happen if the application
+    // inputs duplicate frames).
+    if (num_samples > min_blocks_estimate && avg_est > 0) {
+      // Normalize.
+      avg_est = (avg_est << 8) / num_samples;
+      // Update noise estimate.
+      cpi->denoiser.noise_estimate = (3 * cpi->denoiser.noise_estimate +
+          avg_est) >> 2;
+      cpi->denoiser.noise_estimate_count++;
+      if (cpi->denoiser.noise_estimate_count == num_frames_estimate) {
+        // Reset counter and check noise level condition.
+        cpi->denoiser.noise_estimate_count = 0;
+        cpi->denoiser.denoising_on =
+            cpi->denoiser.noise_estimate > cpi->denoiser.thresh_noise_estimate;
+      }
+    }
+  }
+  copy_frame(&cpi->denoiser.last_source, cpi->Source);
 }
 
 #ifdef OUTPUT_YUV_DENOISED
diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h
index b2af792b..f8ad4acd 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/libvpx/vp9/encoder/vp9_denoiser.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 #include "vpx_scale/yv12config.h"
 
 #ifdef __cplusplus
@@ -28,16 +29,24 @@ typedef enum vp9_denoiser_decision {
 typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
   YV12_BUFFER_CONFIG mc_running_avg_y;
+  YV12_BUFFER_CONFIG last_source;
   int increase_denoising;
   int frame_buffer_initialized;
+  int denoising_on;
+  int noise_estimate;
+  int thresh_noise_estimate;
+  int noise_estimate_count;
 } VP9_DENOISER;
 
+struct VP9_COMP;
+
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame,
-                                    int refresh_last_frame);
+                                    int refresh_last_frame,
+                                    int resized);
 
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
@@ -67,6 +76,12 @@ static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
+void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
+                                      int width,
+                                      int height);
+
+void vp9_denoiser_update_noise_estimate(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 295a7512..2333a139 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -16,6 +16,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_ports/system_state.h"
@@ -979,8 +980,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
   const struct segmentation *const seg = &cm->seg;
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
   MV_REF *const frame_mvs =
       cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
   int w, h;
@@ -1132,8 +1133,8 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
 
   mbmi->sb_type = bsize;
   mbmi->mode = ZEROMV;
-  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
-                      tx_mode_to_biggest_tx_size[tx_mode]);
+  mbmi->tx_size =
+      VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[tx_mode]);
   mbmi->skip = 1;
   mbmi->uv_mode = DC_PRED;
   mbmi->ref_frame[0] = LAST_FRAME;
@@ -1496,7 +1497,7 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
                                       int rows_left, int cols_left,
                                       int *bh, int *bw) {
   if (rows_left <= 0 || cols_left <= 0) {
-    return MIN(bsize, BLOCK_8X8);
+    return VPXMIN(bsize, BLOCK_8X8);
   } else {
     for (; bsize > 0; bsize -= 3) {
       *bh = num_8x8_blocks_high_lookup[bsize];
@@ -1672,8 +1673,8 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
   const struct segmentation *const seg = &cm->seg;
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
 
   *(xd->mi[0]) = ctx->mic;
   *(x->mbmi_ext) = ctx->mbmi_ext;
@@ -1738,10 +1739,12 @@ static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
   update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && output_enabled &&
-      cpi->common.frame_type != KEY_FRAME) {
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      output_enabled &&
+      cpi->common.frame_type != KEY_FRAME &&
+      cpi->resize_pending == 0) {
     vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
-                         MAX(BLOCK_8X8, bsize), ctx);
+                         VPXMAX(BLOCK_8X8, bsize), ctx);
   }
 #endif
 
@@ -2133,8 +2136,8 @@ static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
       MODE_INFO *mi = mi_8x8[index+j];
       BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
       bs_hist[sb_type]++;
-      *min_block_size = MIN(*min_block_size, sb_type);
-      *max_block_size = MAX(*max_block_size, sb_type);
+      *min_block_size = VPXMIN(*min_block_size, sb_type);
+      *max_block_size = VPXMAX(*max_block_size, sb_type);
     }
     index += xd->mi_stride;
   }
@@ -2211,8 +2214,8 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
   if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
-    min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
-                   MIN(min_size, max_size));
+    min_size =
+        VPXMIN(cpi->sf.rd_auto_partition_min_limit, VPXMIN(min_size, max_size));
   }
 
   // When use_square_partition_only is true, make sure at least one square
@@ -2248,8 +2251,8 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
       for (idx = 0; idx < mi_width; ++idx) {
         mi = prev_mi[idy * cm->mi_stride + idx];
         bs = mi ? mi->mbmi.sb_type : bsize;
-        min_size = MIN(min_size, bs);
-        max_size = MAX(max_size, bs);
+        min_size = VPXMIN(min_size, bs);
+        max_size = VPXMAX(max_size, bs);
       }
     }
   }
@@ -2258,8 +2261,8 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
     for (idy = 0; idy < mi_height; ++idy) {
       mi = xd->mi[idy * cm->mi_stride - 1];
       bs = mi ? mi->mbmi.sb_type : bsize;
-      min_size = MIN(min_size, bs);
-      max_size = MAX(max_size, bs);
+      min_size = VPXMIN(min_size, bs);
+      max_size = VPXMAX(max_size, bs);
     }
   }
 
@@ -2267,8 +2270,8 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
     for (idx = 0; idx < mi_width; ++idx) {
       mi = xd->mi[idx - cm->mi_stride];
       bs = mi ? mi->mbmi.sb_type : bsize;
-      min_size = MIN(min_size, bs);
-      max_size = MAX(max_size, bs);
+      min_size = VPXMIN(min_size, bs);
+      max_size = VPXMAX(max_size, bs);
     }
   }
 
@@ -2376,11 +2379,20 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                                bsize >= BLOCK_8X8;
   int partition_vert_allowed = !force_horz_split && xss <= yss &&
                                bsize >= BLOCK_8X8;
-  (void) *tp_orig;
+
+  int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+  int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
+
+  (void)*tp_orig;
 
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
+  // Adjust dist breakout threshold according to the partition size.
+  dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+      b_height_log2_lookup[bsize]);
+  rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
   vp9_rd_cost_init(&this_rdc);
   vp9_rd_cost_init(&sum_rdc);
   vp9_rd_cost_reset(&best_rdc);
@@ -2409,9 +2421,11 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                                 force_vert_split);
     do_split &= bsize > min_size;
   }
-  if (cpi->sf.use_square_partition_only) {
-    partition_horz_allowed &= force_horz_split;
-    partition_vert_allowed &= force_vert_split;
+
+  if (cpi->sf.use_square_partition_only &&
+      bsize > cpi->sf.use_square_only_threshold) {
+      partition_horz_allowed &= force_horz_split;
+      partition_vert_allowed &= force_vert_split;
   }
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2433,9 +2447,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     int mb_row = mi_row >> 1;
     int mb_col = mi_col >> 1;
     int mb_row_end =
-        MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+        VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
     int mb_col_end =
-        MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+        VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
     int r, c;
 
     // compute a complexity measure, basically measure inconsistency of motion
@@ -2488,27 +2502,17 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       }
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
-        int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
-        int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
-
         best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
-        // Adjust dist breakout threshold according to the partition size.
-        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
-            b_height_log2_lookup[bsize]);
-
-        rate_breakout_thr *= num_pels_log2_lookup[bsize];
-
         // If all y, u, v transform blocks in this partition are skippable, and
         // the dist & rate are within the thresholds, the partition search is
         // terminated for current branch of the partition search tree.
-        // The dist & rate thresholds are set to 0 at speed 0 to disable the
-        // early termination at that speed.
-        if (!x->e_mbd.lossless &&
-            (ctx->skippable && best_rdc.dist < dist_breakout_thr &&
-            best_rdc.rate < rate_breakout_thr)) {
+        if (!x->e_mbd.lossless && ctx->skippable  &&
+            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+             (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr))) {
           do_split = 0;
           do_rect = 0;
         }
@@ -2524,9 +2528,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
           int mb_row = mi_row >> 1;
           int mb_col = mi_col >> 1;
           int mb_row_end =
-              MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+              VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
           int mb_col_end =
-              MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+              VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
           int r, c;
 
           int skip = 1;
@@ -2618,11 +2622,21 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_SPLIT;
+
+        // Rate and distortion based partition search termination clause.
+        if (!x->e_mbd.lossless &&
+            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+             (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr))) {
+          do_rect = 0;
+        }
       }
     } else {
       // skip rectangular partition test when larger block size
       // gives better rd cost
-      if (cpi->sf.less_rectangular_check)
+      if ((cpi->sf.less_rectangular_check) &&
+          ((bsize > cpi->sf.use_square_only_threshold) ||
+           (best_rdc.dist < dist_breakout_thr)))
         do_rect &= !partition_none_allowed;
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2631,7 +2645,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_HORZ
   if (partition_horz_allowed &&
       (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
-      subsize = get_subsize(bsize, PARTITION_HORZ);
+    subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
@@ -2672,6 +2686,10 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_HORZ;
+
+        if ((cpi->sf.less_rectangular_check) &&
+            (bsize > cpi->sf.use_square_only_threshold))
+          do_rect = 0;
       }
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2679,7 +2697,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_VERT
   if (partition_vert_allowed &&
       (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
-      subsize = get_subsize(bsize, PARTITION_VERT);
+    subsize = get_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
@@ -2733,7 +2751,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   (void) best_rd;
   *rd_cost = best_rdc;
 
-
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
     int output_enabled = (bsize == BLOCK_64X64);
@@ -3646,7 +3663,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
   const int last_stride = cpi->Last_Source->y_stride;
 
   // Pick cutoff threshold
-  const int cutoff = (MIN(cm->width, cm->height) >= 720) ?
+  const int cutoff = (VPXMIN(cm->width, cm->height) >= 720) ?
       (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) :
       (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
   DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
@@ -3947,7 +3964,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 #endif
 
     // If allowed, encoding tiles in parallel with one thread handling one tile.
-    if (MIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+    if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
       vp9_encode_tiles_mt(cpi);
     else
       encode_tiles(cpi);
@@ -4162,10 +4179,10 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
     int plane;
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-      vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
+      vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane);
     if (output_enabled)
       sum_intra_stats(td->counts, mi);
-    vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -4178,12 +4195,14 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
                            &xd->block_refs[ref]->sf);
     }
     if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
-      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col,
+                                     VPXMAX(bsize, BLOCK_8X8));
 
-    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col,
+                                    VPXMAX(bsize, BLOCK_8X8));
 
-    vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
-    vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+    vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
   }
 
   if (output_enabled) {
@@ -4197,8 +4216,8 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
       TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
       if (is_inter_block(&mi->mbmi)) {
-        tx_size = MIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
-                      max_txsize_lookup[bsize]);
+        tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                         max_txsize_lookup[bsize]);
       } else {
         tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
       }
@@ -4210,5 +4229,7 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
     }
     ++td->counts->tx.tx_totals[mbmi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_update_sb_postencode(cpi, mbmi, mi_row, mi_col, bsize);
   }
 }
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 00e4c610..3c6a9283 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -99,7 +99,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const int eob = p->eobs[block];
-  const PLANE_TYPE type = pd->plane_type;
+  const PLANE_TYPE type = get_plane_type(plane);
   const int default_eob = 16 << (tx_size << 1);
   const int mul = 1 + (tx_size == TX_32X32);
   const int16_t *dequant_ptr = pd->dequant;
@@ -789,7 +789,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   if (tx_size == TX_4X4) {
-    tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
+    tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
     scan_order = &vp9_scan_orders[TX_4X4][tx_type];
     mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
   } else {
@@ -797,7 +797,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
     if (tx_size == TX_32X32) {
       scan_order = &vp9_default_scan_orders[TX_32X32];
     } else {
-      tx_type = get_tx_type(pd->plane_type, xd);
+      tx_type = get_tx_type(get_plane_type(plane), xd);
       scan_order = &vp9_scan_orders[tx_size][tx_type];
     }
   }
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index 7848c93a..e7196634 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -16,6 +16,8 @@
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemv.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
+
 static struct vp9_token mv_joint_encodings[MV_JOINTS];
 static struct vp9_token mv_class_encodings[MV_CLASSES];
 static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
@@ -216,8 +218,8 @@ void vp9_encode_mv(VP9_COMP* cpi, vpx_writer* w,
   // If auto_mv_step_size is enabled then keep track of the largest
   // motion vector component used.
   if (cpi->sf.mv.auto_mv_step_size) {
-    unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
-    cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
+    unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
+    cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude);
   }
 }
 
diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c
index 4654d63b..72eafec4 100644
--- a/libvpx/vp9/encoder/vp9_encoder.c
+++ b/libvpx/vp9/encoder/vp9_encoder.c
@@ -17,6 +17,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
@@ -411,6 +412,8 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
 
   vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
   memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
+
+  vp9_free_svc_cyclic_refresh(cpi);
 }
 
 static void save_coding_context(VP9_COMP *cpi) {
@@ -686,7 +689,7 @@ static int alloc_context_buffers_ext(VP9_COMP *cpi) {
   return 0;
 }
 
-void vp9_alloc_compressor_data(VP9_COMP *cpi) {
+static void alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
   vp9_alloc_context_buffers(cm, cm->width, cm->height);
@@ -772,10 +775,11 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   cm->use_highbitdepth = oxcf->use_highbitdepth;
 #endif
   cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
-  vp9_alloc_compressor_data(cpi);
+  alloc_compressor_data(cpi);
 
   cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
 
@@ -1452,11 +1456,14 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) {
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  int last_w = cpi->oxcf.width;
+  int last_h = cpi->oxcf.height;
 
   if (cm->profile != oxcf->profile)
     cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
   cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
 
   if (cm->profile <= PROFILE_1)
     assert(cm->bit_depth == VPX_BITS_8);
@@ -1490,8 +1497,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
-  rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
-  rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
   vp9_new_framerate(cpi, cpi->framerate);
@@ -1502,15 +1509,25 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 
   cm->interp_filter = cpi->sf.default_interp_filter;
 
-  cm->display_width = cpi->oxcf.width;
-  cm->display_height = cpi->oxcf.height;
-  cm->width = cpi->oxcf.width;
-  cm->height = cpi->oxcf.height;
+  if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+    cm->render_width = cpi->oxcf.render_width;
+    cm->render_height = cpi->oxcf.render_height;
+  } else {
+    cm->render_width = cpi->oxcf.width;
+    cm->render_height = cpi->oxcf.height;
+  }
+  if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
+    cm->width = cpi->oxcf.width;
+    cm->height = cpi->oxcf.height;
+  }
 
   if (cpi->initial_width) {
-    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+    int new_mi_size = 0;
+    vp9_set_mb_mi(cm, cm->width, cm->height);
+    new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+    if (cm->mi_alloc_size < new_mi_size) {
       vp9_free_context_buffers(cm);
-      vp9_alloc_compressor_data(cpi);
+      alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
     }
@@ -1918,14 +1935,15 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
 
 void vp9_remove_compressor(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
+  VP9_COMMON *cm;
   unsigned int i;
   int t;
 
   if (!cpi)
     return;
 
-  if (cpi && (cm->current_video_frame > 0)) {
+  cm = &cpi->common;
+  if (cm->current_video_frame > 0) {
 #if CONFIG_INTERNAL_STATS
     vpx_clear_system_state();
 
@@ -2247,42 +2265,6 @@ typedef struct {
   uint32_t samples[4];  // total/y/u/v
 } PSNR_STATS;
 
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                      PSNR_STATS *psnr) {
-  static const double peak = 255.0;
-  const int widths[3]        = {
-      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
-  const int heights[3]       = {
-      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
-  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
-  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
-                                 b_planes[i], b_strides[i],
-                                 w, h);
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-
 #if CONFIG_VP9_HIGHBITDEPTH
 static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                              const YV12_BUFFER_CONFIG *b,
@@ -2335,6 +2317,44 @@ static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
   psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
                                   (double)total_sse);
 }
+
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+
+static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                      PSNR_STATS *psnr) {
+  static const double peak = 255.0;
+  const int widths[3]        = {
+      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+  const int heights[3]       = {
+      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
+  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
+  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
+                                 b_planes[i], b_strides[i],
+                                 w, h);
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+                                  (double)total_sse);
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void generate_psnr_packet(VP9_COMP *cpi) {
@@ -2615,7 +2635,7 @@ static int scale_down(VP9_COMP *cpi, int q) {
   if (rc->frame_size_selector == UNSCALED &&
       q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
     const int max_size_thresh = (int)(rate_thresh_mult[SCALE_STEP1]
-        * MAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+        * VPXMAX(rc->this_frame_target, rc->avg_frame_bandwidth));
     scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
   }
   return scale;
@@ -2736,7 +2756,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
                                    cpi->common.frame_type,
                                    cpi->refresh_alt_ref_frame,
                                    cpi->refresh_golden_frame,
-                                   cpi->refresh_last_frame);
+                                   cpi->refresh_last_frame,
+                                   cpi->resize_pending);
   }
 #endif
 }
@@ -2744,6 +2765,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   struct loopfilter *lf = &cm->lf;
+
   if (xd->lossless) {
       lf->filter_level = 0;
   } else {
@@ -2760,6 +2782,8 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   }
 
   if (lf->filter_level > 0) {
+    vp9_build_mask_frame(cm, lf->filter_level, 0);
+
     if (cpi->num_workers > 1)
       vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
                                lf->filter_level, 0, 0,
@@ -2998,7 +3022,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
 
 static void set_mv_search_params(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
-  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+  const unsigned int max_mv_def = VPXMIN(cm->width, cm->height);
 
   // Default based on max resolution.
   cpi->mv_step_param = vp9_init_search_range(max_mv_def);
@@ -3013,8 +3037,8 @@ static void set_mv_search_params(VP9_COMP *cpi) {
         // Allow mv_steps to correspond to twice the max mv magnitude found
         // in the previous frame, capped by the default max_mv_magnitude based
         // on resolution.
-        cpi->mv_step_param =
-            vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+        cpi->mv_step_param = vp9_init_search_range(
+            VPXMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
       }
       cpi->max_mv_magnitude = 0;
     }
@@ -3076,6 +3100,21 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
 #endif  // CONFIG_VP9_POSTPROC
 }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      !cpi->denoiser.frame_buffer_initialized) {
+    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       cm->use_highbitdepth,
+#endif
+                       VP9_ENC_BORDER_IN_PIXELS);
+  }
+}
+#endif
+
 static void init_motion_estimation(VP9_COMP *cpi) {
   int y_stride = cpi->scaled_source.y_stride;
 
@@ -3107,26 +3146,30 @@ static void set_frame_size(VP9_COMP *cpi) {
   if (oxcf->pass == 0 &&
       oxcf->rc_mode == VPX_CBR &&
       !cpi->use_svc &&
-      oxcf->resize_mode == RESIZE_DYNAMIC) {
-      if (cpi->resize_pending == 1) {
-        oxcf->scaled_frame_width =
-            (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
-        oxcf->scaled_frame_height =
-            (cm->height * cpi->resize_scale_num) /cpi->resize_scale_den;
-      } else if (cpi->resize_pending == -1) {
-        // Go back up to original size.
-        oxcf->scaled_frame_width = oxcf->width;
-        oxcf->scaled_frame_height = oxcf->height;
-      }
-      if (cpi->resize_pending != 0) {
-        // There has been a change in frame size.
-        vp9_set_size_literal(cpi,
-                             oxcf->scaled_frame_width,
-                             oxcf->scaled_frame_height);
-
-        // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
-        set_mv_search_params(cpi);
-      }
+      oxcf->resize_mode == RESIZE_DYNAMIC &&
+      cpi->resize_pending != 0) {
+    oxcf->scaled_frame_width =
+        (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+    oxcf->scaled_frame_height =
+        (oxcf->height * cpi->resize_scale_num) /cpi->resize_scale_den;
+    // There has been a change in frame size.
+    vp9_set_size_literal(cpi,
+                         oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
+
+    // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+    set_mv_search_params(cpi);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    // Reset the denoiser on the resized frame.
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      vp9_denoiser_free(&(cpi->denoiser));
+      setup_denoiser_buffer(cpi);
+      // Dynamic resize is only triggered for non-SVC, so we can force
+      // golden frame update here as a temporary fix to the denoiser.
+      cpi->refresh_golden_frame = 1;
+    }
+#endif
   }
 
   if ((oxcf->pass == 2) &&
@@ -3193,11 +3236,26 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
 
   cpi->Source = vp9_scale_if_required(cm,
                                       cpi->un_scaled_source,
-                                      &cpi->scaled_source);
-  if (cpi->unscaled_last_source != NULL)
+                                      &cpi->scaled_source,
+                                      (cpi->oxcf.pass == 0));
+
+  // Avoid scaling last_source unless it's needed.
+  // Last source is currently only used for screen-content mode,
+  // or if partition_search_type == SOURCE_VAR_BASED_PARTITION.
+  if (cpi->unscaled_last_source != NULL &&
+      (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+      cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION))
     cpi->Last_Source = vp9_scale_if_required(cm,
                                              cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+                                             &cpi->scaled_last_source,
+                                             (cpi->oxcf.pass == 0));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    vp9_denoiser_update_noise_estimate(cpi);
+  }
+#endif
 
   if (cpi->oxcf.pass == 0 &&
       cpi->oxcf.rc_mode == VPX_CBR &&
@@ -3270,6 +3328,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
       cm->frame_type != KEY_FRAME &&
       !cpi->use_svc &&
+      cpi->ext_refresh_frame_flags_pending == 0 &&
       (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
     vp9_cyclic_refresh_check_golden_update(cpi);
 
@@ -3328,11 +3387,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
     }
 
     cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
+                                      &cpi->scaled_source,
+                                      (cpi->oxcf.pass == 0));
 
     if (cpi->unscaled_last_source != NULL)
       cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                               &cpi->scaled_last_source);
+                                               &cpi->scaled_last_source,
+                                               (cpi->oxcf.pass == 0));
 
     if (frame_is_intra_only(cm) == 0) {
       if (loop_count > 0) {
@@ -3414,7 +3475,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
 
           // Adjust Q
           q = (int)((q * high_err_target) / kf_err);
-          q = MIN(q, (q_high + q_low) >> 1);
+          q = VPXMIN(q, (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
                    rc->projected_frame_size >= frame_under_shoot_limit) {
           // The key frame is much better than the previous frame
@@ -3423,7 +3484,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
 
           // Adjust Q
           q = (int)((q * low_err_target) / kf_err);
-          q = MIN(q, (q_high + q_low + 1) >> 1);
+          q = VPXMIN(q, (q_high + q_low + 1) >> 1);
         }
 
         // Clamp Q to upper and lower limits:
@@ -3432,7 +3493,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
         loop = q != last_q;
       } else if (recode_loop_test(
           cpi, frame_over_shoot_limit, frame_under_shoot_limit,
-          q, MAX(q_high, top_index), bottom_index)) {
+          q, VPXMAX(q_high, top_index), bottom_index)) {
         // Is the projected frame size out of range and are we allowed
         // to attempt to recode.
         int last_q = q;
@@ -3474,12 +3535,12 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
             vp9_rc_update_rate_correction_factors(cpi);
 
             q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
-                                   bottom_index, MAX(q_high, top_index));
+                                  bottom_index, VPXMAX(q_high, top_index));
 
             while (q < q_low && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi);
               q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
-                                     bottom_index, MAX(q_high, top_index));
+                                    bottom_index, VPXMAX(q_high, top_index));
               retries++;
             }
           }
@@ -3578,26 +3639,22 @@ static void set_ext_overrides(VP9_COMP *cpi) {
     cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
     cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
     cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
-    cpi->ext_refresh_frame_flags_pending = 0;
   }
 }
 
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled) {
+                                          YV12_BUFFER_CONFIG *scaled,
+                                          int use_normative_scaler) {
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (unscaled->y_width == (scaled->y_width << 1) &&
-        unscaled->y_height == (scaled->y_height << 1))
+    if (use_normative_scaler)
       scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
 #else
-    // Use the faster normative (convolve8) scaling filter: for now only for
-    // scaling factor of 2.
-    if (unscaled->y_width == (scaled->y_width << 1) &&
-        unscaled->y_height == (scaled->y_height << 1))
+    if (use_normative_scaler)
       scale_and_extend_frame(unscaled, scaled);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled);
@@ -3747,6 +3804,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     if (vp9_rc_drop_frame(cpi)) {
       vp9_rc_postencode_update_drop_frame(cpi);
       ++cm->current_video_frame;
+      cpi->ext_refresh_frame_flags_pending = 0;
       return;
     }
   }
@@ -3799,6 +3857,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     cpi->refresh_last_frame = 1;
 
   cm->frame_to_show = get_frame_new_buffer(cm);
+  cm->frame_to_show->color_space = cm->color_space;
+  cm->frame_to_show->color_range = cm->color_range;
+  cm->frame_to_show->render_width  = cm->render_width;
+  cm->frame_to_show->render_height = cm->render_height;
 
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
@@ -3828,6 +3890,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     }
   }
 
+  cpi->ext_refresh_frame_flags_pending = 0;
+
   if (cpi->refresh_golden_frame == 1)
     cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
   else
@@ -3953,21 +4017,6 @@ static void check_initial_width(VP9_COMP *cpi,
   }
 }
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-static void setup_denoiser_buffer(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      !cpi->denoiser.frame_buffer_initialized) {
-    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
-                       cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                       cm->use_highbitdepth,
-#endif
-                       VP9_ENC_BORDER_IN_PIXELS);
-  }
-}
-#endif
-
 int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
@@ -4053,8 +4102,8 @@ static void adjust_frame_rate(VP9_COMP *cpi,
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
-      const double interval = MIN((double)(source->ts_end
-                                   - cpi->first_time_stamp_ever), 10000000.0);
+      const double interval = VPXMIN(
+          (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
       double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
@@ -4118,7 +4167,7 @@ static void adjust_image_stat(double y, double u, double v, double all,
   s->stat[U] += u;
   s->stat[V] += v;
   s->stat[ALL] += all;
-  s->worst = MIN(s->worst, all);
+  s->worst = VPXMIN(s->worst, all);
 }
 #endif  // CONFIG_INTERNAL_STATS
 
@@ -4237,7 +4286,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
       // non-zero spatial layer, it should not be an intra picture.
       // TODO(Won Kap): this needs to change if per-layer intra frame is
       // allowed.
-      if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id) {
+      if ((source->flags & VPX_EFLAG_FORCE_KF) &&
+          cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
         source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
       }
 
@@ -4448,7 +4498,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-          cpi->worst_ssim= MIN(cpi->worst_ssim, frame_ssim2);
+          cpi->worst_ssim = VPXMIN(cpi->worst_ssim, frame_ssim2);
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
@@ -4485,7 +4535,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
               cpi->Source->y_buffer, cpi->Source->y_stride,
               cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
               cpi->Source->y_width, cpi->Source->y_height);
-          cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness);
+          cpi->worst_blockiness =
+              VPXMAX(cpi->worst_blockiness, frame_blockiness);
           cpi->total_blockiness += frame_blockiness;
         }
       }
@@ -4505,8 +4556,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           double consistency = vpx_sse_to_psnr(samples, peak,
                                              (double)cpi->total_inconsistency);
           if (consistency > 0.0)
-            cpi->worst_consistency = MIN(cpi->worst_consistency,
-                                         consistency);
+            cpi->worst_consistency =
+                VPXMIN(cpi->worst_consistency, consistency);
           cpi->total_inconsistency += this_inconsistency;
         }
       }
@@ -4618,8 +4669,10 @@ int vp9_set_internal_size(VP9_COMP *cpi,
   // always go to the next whole number
   cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs;
   cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs;
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
+  if (cm->current_video_frame) {
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
+  }
 
   update_frame_size(cpi);
 
diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h
index c10abd20..159c03aa 100644
--- a/libvpx/vp9/encoder/vp9_encoder.h
+++ b/libvpx/vp9/encoder/vp9_encoder.h
@@ -238,6 +238,9 @@ typedef struct VP9EncoderConfig {
   int use_highbitdepth;
 #endif
   vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
+  int render_width;
+  int render_height;
   VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
 } VP9EncoderConfig;
 
@@ -605,8 +608,6 @@ int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
                              const YV12_BUFFER_CONFIG *b);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-void vp9_alloc_compressor_data(VP9_COMP *cpi);
-
 void vp9_scale_references(VP9_COMP *cpi);
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
@@ -615,7 +616,8 @@ void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled);
+                                          YV12_BUFFER_CONFIG *scaled,
+                                          int use_normative_scaler);
 
 void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
diff --git a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c
index 00025b7a..ad25712b 100644
--- a/libvpx/vp9/encoder/vp9_ethread.c
+++ b/libvpx/vp9/encoder/vp9_ethread.c
@@ -11,6 +11,7 @@
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   int i, j, k, l, m, n;
@@ -67,7 +68,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
+  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
   int i;
 
   vp9_init_tile_data(cpi);
@@ -80,7 +81,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
     // resolution.
     if (cpi->use_svc) {
       int max_tile_cols = get_max_tile_cols(cpi);
-      allocated_workers = MIN(cpi->oxcf.max_threads, max_tile_cols);
+      allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
     }
 
     CHECK_MEM_ERROR(cm, cpi->workers,
@@ -191,7 +192,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
 
     // Accumulate counters.
     if (i < cpi->num_workers - 1) {
-      vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
       accumulate_rd_opt(&cpi->td, thread_data->td);
     }
   }
diff --git a/libvpx/vp9/encoder/vp9_ethread.h b/libvpx/vp9/encoder/vp9_ethread.h
index e87c50bc..1efa4dcd 100644
--- a/libvpx/vp9/encoder/vp9_ethread.h
+++ b/libvpx/vp9/encoder/vp9_ethread.h
@@ -11,6 +11,10 @@
 #ifndef VP9_ENCODER_VP9_ETHREAD_H_
 #define VP9_ENCODER_VP9_ETHREAD_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9_COMP;
 struct ThreadData;
 
@@ -22,4 +26,8 @@ typedef struct EncWorkerData {
 
 void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/libvpx/vp9/encoder/vp9_extend.c b/libvpx/vp9/encoder/vp9_extend.c
index 0c304dc5..92585b82 100644
--- a/libvpx/vp9/encoder/vp9_extend.c
+++ b/libvpx/vp9/encoder/vp9_extend.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -111,10 +112,12 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   // Motion estimation may use src block variance with the block size up
   // to 64x64, so the right and bottom need to be extended to 64 multiple
   // or up to 16, whichever is greater.
-  const int er_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
-      - src->y_crop_width;
-  const int eb_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
-      - src->y_crop_height;
+  const int er_y =
+      VPXMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+      src->y_crop_width;
+  const int eb_y =
+      VPXMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+      src->y_crop_height;
   const int uv_width_subsampling = (src->uv_width != src->y_width);
   const int uv_height_subsampling = (src->uv_height != src->y_height);
   const int et_uv = et_y >> uv_height_subsampling;
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index e0c5966e..30738b52 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
@@ -381,7 +382,7 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
 // for first pass test.
 static int get_search_range(const VP9_COMP *cpi) {
   int sr = 0;
-  const int dim = MIN(cpi->initial_width, cpi->initial_height);
+  const int dim = VPXMIN(cpi->initial_width, cpi->initial_height);
 
   while ((dim << sr) < MAX_FULL_PEL_VAL)
     ++sr;
@@ -596,7 +597,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
                  (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
 
     cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                        &cpi->scaled_source);
+                                        &cpi->scaled_source, 0);
   }
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -1024,7 +1025,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   // Exclude any image dead zone
   if (image_data_start_row > 0) {
     intra_skip_count =
-      MAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+        VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
   }
 
   {
@@ -1161,7 +1162,7 @@ static double calc_correction_factor(double err_per_mb,
 
   // Adjustment based on actual quantizer to power term.
   const double power_term =
-      MIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+      VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
 
   // Calculate correction factor.
   if (power_term < 1.0)
@@ -1182,19 +1183,22 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
                                      double group_weight_factor) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  // Clamp the target rate to VBR min / max limits.
+  const int target_rate =
+      vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
 
   inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
 
-  if (section_target_bandwidth <= 0) {
+  if (target_rate <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                         ? cpi->initial_mbs : cpi->common.MBs;
-    const int active_mbs = MAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
     const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
-    const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
+    const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
 
     int q;
@@ -1223,7 +1227,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
 
     // Restriction on active max q for constrained quality mode.
     if (cpi->oxcf.rc_mode == VPX_CQ)
-      q = MAX(q, oxcf->cq_level);
+      q = VPXMAX(q, oxcf->cq_level);
     return q;
   }
 }
@@ -1233,11 +1237,11 @@ static void setup_rf_level_maxq(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
     int qdelta = vp9_frame_type_qdelta(cpi, i, rc->worst_quality);
-    rc->rf_level_maxq[i] = MAX(rc->worst_quality + qdelta, rc->best_quality);
+    rc->rf_level_maxq[i] = VPXMAX(rc->worst_quality + qdelta, rc->best_quality);
   }
 }
 
-void vp9_init_subsampling(VP9_COMP *cpi) {
+static void init_subsampling(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   const int w = cm->width;
@@ -1332,7 +1336,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   twopass->last_kfgroup_zeromotion_pct = 100;
 
   if (oxcf->resize_mode != RESIZE_NONE) {
-    vp9_init_subsampling(cpi);
+    init_subsampling(cpi);
   }
 }
 
@@ -1364,12 +1368,12 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
 
 
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = MIN(sr_diff, SR_DIFF_MAX);
+    sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
     sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
                (MOTION_AMP_PART * motion_amplitude_factor) -
                (INTRA_PART * modified_pcnt_intra);
   }
-  return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+  return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
 }
 
 // This function gives an estimate of how badly we believe the prediction
@@ -1379,7 +1383,7 @@ static double get_zero_motion_factor(const VP9_COMP *cpi,
   const double zero_motion_pct = frame->pcnt_inter -
                                  frame->pcnt_motion;
   double sr_decay = get_sr_decay_rate(cpi, frame);
-  return MIN(sr_decay, zero_motion_pct);
+  return VPXMIN(sr_decay, zero_motion_pct);
 }
 
 #define ZM_POWER_FACTOR 0.75
@@ -1391,8 +1395,8 @@ static double get_prediction_decay_rate(const VP9_COMP *cpi,
     (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
                 ZM_POWER_FACTOR));
 
-  return MAX(zero_motion_factor,
-             (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+  return VPXMAX(zero_motion_factor,
+                (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
 }
 
 // Function to test for a condition where a complex transition is followed
@@ -1483,12 +1487,12 @@ static double calc_frame_boost(VP9_COMP *cpi,
   const double lq =
     vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
                             cpi->common.bit_depth);
-  const double boost_q_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+  const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
   int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                 ? cpi->initial_mbs : cpi->common.MBs;
 
   // Correct for any inactive region in the image
-  num_mbs = (int)MAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+  num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
 
   // Underlying boost factor is based on inter error ratio.
   frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
@@ -1504,7 +1508,7 @@ static double calc_frame_boost(VP9_COMP *cpi,
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
-  return MIN(frame_boost, max_boost * boost_q_correction);
+  return VPXMIN(frame_boost, max_boost * boost_q_correction);
 }
 
 static int calc_arf_boost(VP9_COMP *cpi, int offset,
@@ -1593,7 +1597,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
   arf_boost = (*f_boost + *b_boost);
   if (arf_boost < ((b_frames + f_frames) * 20))
     arf_boost = ((b_frames + f_frames) * 20);
-  arf_boost = MAX(arf_boost, MIN_ARF_GF_BOOST);
+  arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
 
   return arf_boost;
 }
@@ -1664,7 +1668,8 @@ static int calculate_boost_bits(int frame_count,
   }
 
   // Calculate the number of extra bits for use in the boosted frame or frames.
-  return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
+  return VPXMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+                0);
 }
 
 // Current limit on maximum number of active arfs in a GF/ARF group.
@@ -1803,7 +1808,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
     gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
 
     target_frame_size = clamp(target_frame_size, 0,
-                              MIN(max_bits, (int)total_group_bits));
+                              VPXMIN(max_bits, (int)total_group_bits));
 
     gf_group->update_type[frame_index] = LF_UPDATE;
     gf_group->rf_level[frame_index] = INTER_NORMAL;
@@ -1924,7 +1929,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     int int_lbq =
       (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
                                    cpi->common.bit_depth));
-    active_min_gf_interval = rc->min_gf_interval + MIN(2, int_max_q / 200);
+    active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
     if (active_min_gf_interval > rc->max_gf_interval)
       active_min_gf_interval = rc->max_gf_interval;
 
@@ -1935,7 +1940,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // bits to spare and are better with a smaller interval and smaller boost.
       // At high Q when there are few bits to spare we are better with a longer
       // interval to spread the cost of the GF.
-      active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
+      active_max_gf_interval = 12 + VPXMIN(4, (int_lbq / 6));
       if (active_max_gf_interval < active_min_gf_interval)
         active_max_gf_interval = active_min_gf_interval;
 
@@ -1980,8 +1985,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
-      zero_motion_accumulator =
-        MIN(zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+      zero_motion_accumulator = VPXMIN(
+          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
@@ -2037,7 +2042,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
       (zero_motion_accumulator < 0.995)) ? 1 : 0;
   } else {
-    rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
+    rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST);
     rc->source_alt_ref_pending = 0;
   }
 
@@ -2092,11 +2097,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // rc factor is a weight factor that corrects for local rate control drift.
     double rc_factor = 1.0;
     if (rc->rate_error_estimate > 0) {
-      rc_factor = MAX(RC_FACTOR_MIN,
-                      (double)(100 - rc->rate_error_estimate) / 100.0);
+      rc_factor = VPXMAX(RC_FACTOR_MIN,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
     } else {
-      rc_factor = MIN(RC_FACTOR_MAX,
-                      (double)(100 - rc->rate_error_estimate) / 100.0);
+      rc_factor = VPXMIN(RC_FACTOR_MAX,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
     }
     tmp_q =
       get_twopass_worst_quality(cpi, group_av_err,
@@ -2104,7 +2109,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
                                 vbr_group_bits_per_frame,
                                 twopass->kfgroup_inter_fraction * rc_factor);
     twopass->active_worst_quality =
-      MAX(tmp_q, twopass->active_worst_quality >> 1);
+        VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
   }
 #endif
 
@@ -2421,7 +2426,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   } else {
     twopass->kf_group_bits = 0;
   }
-  twopass->kf_group_bits = MAX(0, twopass->kf_group_bits);
+  twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits);
 
   // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
@@ -2435,22 +2440,21 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       break;
 
     // Monitor for static sections.
-    zero_motion_accumulator =
-      MIN(zero_motion_accumulator,
-          get_zero_motion_factor(cpi, &next_frame));
+    zero_motion_accumulator = VPXMIN(
+        zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
 
     // Not all frames in the group are necessarily used in calculating boost.
     if ((i <= rc->max_gf_interval) ||
         ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
       const double frame_boost =
-        calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+        calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
 
       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
         const double loop_decay_rate =
           get_prediction_decay_rate(cpi, &next_frame);
         decay_accumulator *= loop_decay_rate;
-        decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
+        decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR);
         av_decay_accumulator += decay_accumulator;
         ++loop_decay_counter;
       }
@@ -2471,8 +2475,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   // Apply various clamps for min and max boost
   rc->kf_boost = (int)(av_decay_accumulator * boost_score);
-  rc->kf_boost = MAX(rc->kf_boost, (rc->frames_to_key * 3));
-  rc->kf_boost = MAX(rc->kf_boost, MIN_KF_BOOST);
+  rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3));
+  rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST);
 
   // Work out how many bits to allocate for the key frame itself.
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
@@ -2736,11 +2740,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   }
 
   target_rate = gf_group->bit_allocation[gf_group->index];
-  if (cpi->common.frame_type == KEY_FRAME)
-    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
-  else
-    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
-
   rc->base_frame_target = target_rate;
 
   {
@@ -2770,7 +2769,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   // is designed to prevent extreme behaviour at the end of a clip
   // or group of frames.
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
-  twopass->bits_left = MAX(twopass->bits_left - bits_used, 0);
+  twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
 
   // Calculate the pct rc error.
   if (rc->total_actual_bits) {
@@ -2786,7 +2785,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
     twopass->kf_group_bits -= bits_used;
     twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
   }
-  twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
+  twopass->kf_group_bits = VPXMAX(twopass->kf_group_bits, 0);
 
   // Increment the gf group index ready for the next frame.
   ++twopass->gf_group.index;
@@ -2836,18 +2835,18 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
         rc->vbr_bits_off_target_fast +=
           fast_extra_thresh - rc->projected_frame_size;
         rc->vbr_bits_off_target_fast =
-          MIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+          VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
 
         // Fast adaptation of minQ if necessary to use up the extra bits.
         if (rc->avg_frame_bandwidth) {
           twopass->extend_minq_fast =
             (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
         }
-        twopass->extend_minq_fast = MIN(twopass->extend_minq_fast,
-                                        minq_adj_limit - twopass->extend_minq);
+        twopass->extend_minq_fast = VPXMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
       } else if (rc->vbr_bits_off_target_fast) {
-        twopass->extend_minq_fast = MIN(twopass->extend_minq_fast,
-                                        minq_adj_limit - twopass->extend_minq);
+        twopass->extend_minq_fast = VPXMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
       } else {
         twopass->extend_minq_fast = 0;
       }
diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h
index 49f9da38..5875a7b9 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/libvpx/vp9/encoder/vp9_firstpass.h
@@ -153,8 +153,6 @@ void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
 // Post encode update of the rate control parameters for 2-pass
 void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
 
-void vp9_init_subsampling(struct VP9_COMP *cpi);
-
 void calculate_coded_size(struct VP9_COMP *cpi,
                           int *scaled_frame_width,
                           int *scaled_frame_height);
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index d59f3157..41b6d195 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -13,6 +13,7 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/system_state.h"
 #include "vp9/encoder/vp9_segmentation.h"
@@ -29,7 +30,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
                                               int mb_col) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS old_search_method = mv_sf->search_method;
   const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
 
   const int tmp_col_min = x->mv_col_min;
@@ -41,17 +43,18 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
 
   // Further step/diamond searches as necessary
   int step_param = mv_sf->reduce_first_step_size;
-  step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2);
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
   vp9_set_mv_search_range(x, ref_mv);
 
   ref_full.col = ref_mv->col >> 3;
   ref_full.row = ref_mv->row >> 3;
 
-  /*cpi->sf.search_method == HEX*/
-  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
-                 cond_cost_list(cpi, cost_list),
-                 &v_fn_ptr, 0, ref_mv, dst_mv);
+  mv_sf->search_method = HEX;
+  vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param,
+                        x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv,
+                        dst_mv, 0, 0);
+  mv_sf->search_method = old_search_method;
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index aa3e51ce..be8f57f7 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -37,10 +38,10 @@ void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
   int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
   int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
 
-  col_min = MAX(col_min, (MV_LOW >> 3) + 1);
-  row_min = MAX(row_min, (MV_LOW >> 3) + 1);
-  col_max = MIN(col_max, (MV_UPP >> 3) - 1);
-  row_max = MIN(row_max, (MV_UPP >> 3) - 1);
+  col_min = VPXMAX(col_min, (MV_LOW >> 3) + 1);
+  row_min = VPXMAX(row_min, (MV_LOW >> 3) + 1);
+  col_max = VPXMIN(col_max, (MV_UPP >> 3) - 1);
+  row_max = VPXMIN(row_max, (MV_UPP >> 3) - 1);
 
   // Get intersection of UMV window and valid MV window to reduce # of checks
   // in diamond search.
@@ -57,12 +58,12 @@ void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
 int vp9_init_search_range(int size) {
   int sr = 0;
   // Minimum search size no matter what the passed in value.
-  size = MAX(16, size);
+  size = VPXMAX(16, size);
 
   while ((size << sr) < MAX_FULL_PEL_VAL)
     sr++;
 
-  sr = MIN(sr, MAX_MVSEARCH_STEPS - 2);
+  sr = VPXMIN(sr, MAX_MVSEARCH_STEPS - 2);
   return sr;
 }
 
@@ -297,10 +298,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   int br = bestmv->row * 8;                                                \
   int bc = bestmv->col * 8;                                                \
   int hstep = 4;                                                           \
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);           \
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);           \
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);           \
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);           \
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);        \
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);        \
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);        \
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);        \
   int tr = br;                                                             \
   int tc = bc;                                                             \
                                                                            \
@@ -668,10 +669,10 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
   int bc = bestmv->col * 8;
   int hstep = 4;
   int iter, round = 3 - forced_stop;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
   int tr = br;
   int tc = bc;
   const MV *search_step = search_step_table;
@@ -1371,15 +1372,15 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x,
                                  x->mvcost, x->errorperbit) : 0);
 }
 
-int vp9_hex_search(const MACROBLOCK *x,
-                   MV *ref_mv,
-                   int search_param,
-                   int sad_per_bit,
-                   int do_init_search,
-                   int *cost_list,
-                   const vp9_variance_fn_ptr_t *vfp,
-                   int use_mvcost,
-                   const MV *center_mv, MV *best_mv) {
+static int hex_search(const MACROBLOCK *x,
+                      MV *ref_mv,
+                      int search_param,
+                      int sad_per_bit,
+                      int do_init_search,
+                      int *cost_list,
+                      const vp9_variance_fn_ptr_t *vfp,
+                      int use_mvcost,
+                      const MV *center_mv, MV *best_mv) {
   // First scale has 8-closest points, the rest have 6 points in hex shape
   // at increasing scales
   static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1406,16 +1407,16 @@ int vp9_hex_search(const MACROBLOCK *x,
                             hex_num_candidates, hex_candidates);
 }
 
-int vp9_bigdia_search(const MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int sad_per_bit,
-                      int do_init_search,
-                      int *cost_list,
-                      const vp9_variance_fn_ptr_t *vfp,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv) {
+static int bigdia_search(const MACROBLOCK *x,
+                         MV *ref_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp9_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv,
+                         MV *best_mv) {
   // First scale has 4-closest points, the rest have 8 points in diamond
   // shape at increasing scales
   static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1448,16 +1449,16 @@ int vp9_bigdia_search(const MACROBLOCK *x,
                                 bigdia_num_candidates, bigdia_candidates);
 }
 
-int vp9_square_search(const MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int sad_per_bit,
-                      int do_init_search,
-                      int *cost_list,
-                      const vp9_variance_fn_ptr_t *vfp,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv) {
+static int square_search(const MACROBLOCK *x,
+                         MV *ref_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp9_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv,
+                         MV *best_mv) {
   // All scales have 8 closest points in square shape
   static const int square_num_candidates[MAX_PATTERN_SCALES] = {
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
@@ -1490,34 +1491,34 @@ int vp9_square_search(const MACROBLOCK *x,
                             square_num_candidates, square_candidates);
 }
 
-int vp9_fast_hex_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        int do_init_search,  // must be zero for fast_hex
-                        int *cost_list,
-                        const vp9_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv) {
-  return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                        sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
-                        center_mv, best_mv);
+static int fast_hex_search(const MACROBLOCK *x,
+                           MV *ref_mv,
+                           int search_param,
+                           int sad_per_bit,
+                           int do_init_search,  // must be zero for fast_hex
+                           int *cost_list,
+                           const vp9_variance_fn_ptr_t *vfp,
+                           int use_mvcost,
+                           const MV *center_mv,
+                           MV *best_mv) {
+  return hex_search(x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                    sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+                    center_mv, best_mv);
 }
 
-int vp9_fast_dia_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        int do_init_search,
-                        int *cost_list,
-                        const vp9_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv) {
-  return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                           sad_per_bit, do_init_search, cost_list, vfp,
-                           use_mvcost, center_mv, best_mv);
+static int fast_dia_search(const MACROBLOCK *x,
+                           MV *ref_mv,
+                           int search_param,
+                           int sad_per_bit,
+                           int do_init_search,
+                           int *cost_list,
+                           const vp9_variance_fn_ptr_t *vfp,
+                           int use_mvcost,
+                           const MV *center_mv,
+                           MV *best_mv) {
+  return bigdia_search(
+      x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
+      do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
 }
 
 #undef CHECK_BETTER
@@ -1547,10 +1548,10 @@ int vp9_full_range_search_c(const MACROBLOCK *x,
   best_sad = fn_ptr->sdf(what->buf, what->stride,
                          get_buf_from_mv(in_what, ref_mv), in_what->stride) +
                  mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  start_row = MAX(-range, x->mv_row_min - ref_mv->row);
-  start_col = MAX(-range, x->mv_col_min - ref_mv->col);
-  end_row = MIN(range, x->mv_row_max - ref_mv->row);
-  end_col = MIN(range, x->mv_col_max - ref_mv->col);
+  start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
+  start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
+  end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
+  end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
 
   for (r = start_row; r <= end_row; ++r) {
     for (c = start_col; c <= end_col; c += 4) {
@@ -1946,15 +1947,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   return best_sad;
 }
 
+// Runs a sequence of diamond searches in smaller steps for RD.
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
-                           MV *mvp_full, int step_param,
-                           int sadpb, int further_steps, int do_refine,
-                           int *cost_list,
-                           const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, MV *dst_mv) {
+static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
+                              MV *mvp_full, int step_param,
+                              int sadpb, int further_steps, int do_refine,
+                              int *cost_list,
+                              const vp9_variance_fn_ptr_t *fn_ptr,
+                              const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
   int thissme, n, num00 = 0;
   int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
@@ -2021,10 +2023,10 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
-  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
-  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
-  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
+  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_sad = fn_ptr->sdf(what->buf, what->stride,
       get_buf_from_mv(in_what, ref_mv), in_what->stride) +
@@ -2054,10 +2056,10 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
-  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
-  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
-  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
+  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
       get_buf_from_mv(in_what, ref_mv), in_what->stride) +
@@ -2119,10 +2121,10 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
-  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
-  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
-  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
+  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
       get_buf_from_mv(in_what, ref_mv), in_what->stride) +
@@ -2346,29 +2348,29 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
 
   switch (method) {
     case FAST_DIAMOND:
-      var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case FAST_HEX:
-      var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case HEX:
-      var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                       cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case SQUARE:
-      var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = square_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case BIGDIA:
-      var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case NSTEP:
-      var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                                   MAX_MVSEARCH_STEPS - 1 - step_param,
-                                   1, cost_list, fn_ptr, ref_mv, tmp_mv);
+      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                               MAX_MVSEARCH_STEPS - 1 - step_param,
+                               1, cost_list, fn_ptr, ref_mv, tmp_mv);
       break;
     default:
       assert(0 && "Invalid search method.");
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 817bd795..5efd5435 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -72,38 +72,12 @@ int vp9_refining_search_sad(const struct macroblock *x,
                             const struct vp9_variance_vtable *fn_ptr,
                             const struct mv *center_mv);
 
-// Runs sequence of diamond searches in smaller steps for RD.
-int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
-                           MV *mvp_full, int step_param,
-                           int sadpb, int further_steps, int do_refine,
-                           int *cost_list,
-                           const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, MV *dst_mv);
-
 // Perform integral projection based motion estimation.
 unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
                                            MACROBLOCK *x,
                                            BLOCK_SIZE bsize,
                                            int mi_row, int mi_col);
 
-typedef int (integer_mv_pattern_search_fn) (
-    const MACROBLOCK *x,
-    MV *ref_mv,
-    int search_param,
-    int error_per_bit,
-    int do_init_search,
-    int *cost_list,
-    const vp9_variance_fn_ptr_t *vf,
-    int use_mvcost,
-    const MV *center_mv,
-    MV *best_mv);
-
-integer_mv_pattern_search_fn vp9_hex_search;
-integer_mv_pattern_search_fn vp9_bigdia_search;
-integer_mv_pattern_search_fn vp9_square_search;
-integer_mv_pattern_search_fn vp9_fast_hex_search;
-integer_mv_pattern_search_fn vp9_fast_dia_search;
-
 typedef int (fractional_mv_step_fp) (
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index 8e191038..5444bc89 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -40,6 +40,8 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
   VP9_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
+  vp9_build_mask_frame(cm, filt_level, partial_frame);
+
   if (cpi->num_workers > 1)
     vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
                              filt_level, 1, partial_frame,
@@ -92,8 +94,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
   ss_err[filt_mid] = best_err;
 
   while (filter_step > 0) {
-    const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
-    const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
+    const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
 
     // Bias against raising loop filter in favor of lowering it.
     int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c
index cc018fcb..fc4d9ae6 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/libvpx/vp9/encoder/vp9_pickmode.c
@@ -16,6 +16,7 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -293,8 +294,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
-      tx_size = MIN(max_txsize_lookup[bsize],
-                    tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+      tx_size = VPXMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
       tx_size = TX_8X8;
 
@@ -304,8 +305,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
     else if (tx_size > TX_16X16)
       tx_size = TX_16X16;
   } else {
-    tx_size = MIN(max_txsize_lookup[bsize],
-                  tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    tx_size = VPXMIN(max_txsize_lookup[bsize],
+                     tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
   assert(tx_size >= TX_8X8);
@@ -475,8 +476,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
       xd->mi[0]->mbmi.tx_size =
-          MIN(max_txsize_lookup[bsize],
-              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+          VPXMIN(max_txsize_lookup[bsize],
+                 tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
       xd->mi[0]->mbmi.tx_size = TX_8X8;
 
@@ -487,8 +488,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
       xd->mi[0]->mbmi.tx_size = TX_16X16;
   } else {
     xd->mi[0]->mbmi.tx_size =
-        MIN(max_txsize_lookup[bsize],
-            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+        VPXMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
   // Evaluate if the partition block is a skippable block in Y plane.
@@ -687,10 +688,11 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
 }
 #endif
 
-static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
                                int *out_rate_sum, int64_t *out_dist_sum,
-                               unsigned int *var_y, unsigned int *sse_y) {
+                               unsigned int *var_y, unsigned int *sse_y,
+                               int start_plane, int stop_plane) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -702,12 +704,12 @@ static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_rate_sum = 0;
   *out_dist_sum = 0;
 
-  for (i = 1; i <= 2; ++i) {
+  for (i = start_plane; i <= stop_plane; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const uint32_t dc_quant = pd->dequant[0];
     const uint32_t ac_quant = pd->dequant[1];
-    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    const BLOCK_SIZE bs = plane_bsize;
     unsigned int var;
 
     if (!x->color_sensitivity[i - 1])
@@ -791,7 +793,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
     const unsigned int max_thresh = 36000;
     // The encode_breakout input
     const unsigned int min_thresh =
-        MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+        VPXMIN(((unsigned int)x->encode_breakout << 4), max_thresh);
 #if CONFIG_VP9_HIGHBITDEPTH
     const int shift = (xd->bd << 1) - 16;
 #endif
@@ -892,12 +894,8 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   int i, j;
   int rate;
   int64_t dist;
-  int64_t this_sse = INT64_MAX;
-  int is_skippable;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  assert(plane == 0);
-  (void) plane;
 
   p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
   pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
@@ -907,13 +905,22 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                           x->skip_encode ? p->src.buf : pd->dst.buf,
                           x->skip_encode ? src_stride : dst_stride,
                           pd->dst.buf, dst_stride,
-                          i, j, 0);
+                          i, j, plane);
 
-  // TODO(jingning): This needs further refactoring.
-  block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
-            bsize_tx, MIN(tx_size, TX_16X16));
-  x->skip_txfm[0] = is_skippable;
-  rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+  if (plane == 0) {
+    int64_t this_sse = INT64_MAX;
+    int is_skippable;
+    // TODO(jingning): This needs further refactoring.
+    block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+              bsize_tx, VPXMIN(tx_size, TX_16X16));
+    x->skip_txfm[0] = is_skippable;
+    // TODO(jingning): Skip is signalled per prediction block not per tx block.
+    rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+  } else {
+    unsigned int var, sse;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
+                       plane, plane);
+  }
 
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
@@ -961,8 +968,8 @@ static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
   if (thr_mode_idx == best_mode_idx)
     *freq_fact -= (*freq_fact >> 4);
   else
-    *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
-        cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+    *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC,
+                        cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
 }
 
 void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
@@ -973,8 +980,8 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   PREDICTION_MODE this_mode;
   struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
   const TX_SIZE intra_tx_size =
-      MIN(max_txsize_lookup[bsize],
-          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+      VPXMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   MODE_INFO *const mic = xd->mi[0];
   int *bmode_costs;
   const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
@@ -1160,8 +1167,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
   mbmi->ref_frame[1] = NONE;
-  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
-                      tx_mode_to_biggest_tx_size[cm->tx_mode]);
+  mbmi->tx_size = VPXMIN(max_txsize_lookup[bsize],
+                         tx_mode_to_biggest_tx_size[cm->tx_mode]);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   vp9_denoiser_reset_frame_stats(ctx);
@@ -1231,10 +1238,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
-    i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
-    if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
-      if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
-        ref_frame_skip_mask |= (1 << ref_frame);
+    if (!(this_mode == ZEROMV && ref_frame == LAST_FRAME)) {
+      i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+      if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
+        if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+          ref_frame_skip_mask |= (1 << ref_frame);
+    }
     if (ref_frame_skip_mask & (1 << ref_frame))
       continue;
 
@@ -1414,7 +1423,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (!this_early_term) {
       this_sse = (int64_t)sse_y;
       block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
-                &this_sse, 0, bsize, MIN(mbmi->tx_size, TX_16X16));
+                &this_sse, 0, bsize, VPXMIN(mbmi->tx_size, TX_16X16));
       x->skip_txfm[0] = is_skippable;
       if (is_skippable) {
         this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -1442,12 +1451,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
       int uv_rate = 0;
       int64_t uv_dist = 0;
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
       if (x->color_sensitivity[0])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
       if (x->color_sensitivity[1])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
-      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
-                         &var_y, &sse_y);
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &uv_rate, &uv_dist,
+                         &var_y, &sse_y, 1, 2);
       this_rdc.rate += uv_rate;
       this_rdc.dist += uv_dist;
     }
@@ -1522,11 +1532,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
        bsize <= cpi->sf.max_intra_bsize)) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
-    const TX_SIZE intra_tx_size =
-        MIN(max_txsize_lookup[bsize],
-            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
+    TX_SIZE intra_tx_size =
+        VPXMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16)
+      intra_tx_size = TX_16X16;
 
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
@@ -1570,6 +1582,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       mbmi->tx_size = intra_tx_size;
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
+      // Inter and intra RD will mismatch in scale for non-screen content.
+      if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+        if (x->color_sensitivity[0])
+          vp9_foreach_transformed_block_in_plane(xd, bsize, 1,
+                                                 estimate_block_intra, &args);
+        if (x->color_sensitivity[1])
+          vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
+                                                 estimate_block_intra, &args);
+      }
       this_rdc.rate = args.rate;
       this_rdc.dist = args.dist;
       this_rdc.rate += cpi->mbmode_cost[this_mode];
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 4ba34067..d7006857 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
@@ -106,8 +107,8 @@ static int kf_low = 400;
 static int get_minq_index(double maxq, double x3, double x2, double x1,
                           vpx_bit_depth_t bit_depth) {
   int i;
-  const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq,
-                                maxq);
+  const double minqtarget = VPXMIN(((x3 * maxq + x2) * maxq + x1) * maxq,
+                                   maxq);
 
   // Special case handling to deal with the step from q2.0
   // down to lossless mode represented by q 1.0.
@@ -192,15 +193,15 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
                            vpx_bit_depth_t bit_depth) {
   const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
                                            bit_depth));
-  return MAX(FRAME_OVERHEAD_BITS,
-             (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+  return VPXMAX(FRAME_OVERHEAD_BITS,
+                (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
 }
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
-  const int min_frame_target = MAX(rc->min_frame_bandwidth,
-                                   rc->avg_frame_bandwidth >> 5);
+  const int min_frame_target = VPXMAX(rc->min_frame_bandwidth,
+                                      rc->avg_frame_bandwidth >> 5);
   if (target < min_frame_target)
     target = min_frame_target;
   if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
@@ -216,7 +217,7 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   if (oxcf->rc_max_inter_bitrate_pct) {
     const int max_rate = rc->avg_frame_bandwidth *
                          oxcf->rc_max_inter_bitrate_pct / 100;
-    target = MIN(target, max_rate);
+    target = VPXMIN(target, max_rate);
   }
   return target;
 }
@@ -227,7 +228,7 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
   if (oxcf->rc_max_intra_bitrate_pct) {
     const int max_rate = rc->avg_frame_bandwidth *
                              oxcf->rc_max_intra_bitrate_pct / 100;
-    target = MIN(target, max_rate);
+    target = VPXMIN(target, max_rate);
   }
   if (target > rc->max_frame_bandwidth)
     target = rc->max_frame_bandwidth;
@@ -250,7 +251,8 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
     lrc->bits_off_target += bits_off_for_this_layer;
 
     // Clip buffer level to maximum buffer size for the layer.
-    lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+    lrc->bits_off_target =
+        VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = lrc->bits_off_target;
   }
 }
@@ -268,7 +270,14 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
   }
 
   // Clip the buffer level to the maximum specified buffer size.
-  rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+
+  // For screen-content mode, and if frame-dropper is off, don't let buffer
+  // level go below threshold, given here as -rc->maximum_buffer_size.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.drop_frames_water_mark == 0)
+    rc->bits_off_target = VPXMAX(rc->bits_off_target, -rc->maximum_buffer_size);
+
   rc->buffer_level = rc->bits_off_target;
 
   if (is_one_pass_cbr_svc(cpi)) {
@@ -287,8 +296,8 @@ int vp9_rc_get_default_min_gf_interval(
   if (factor <= factor_safe)
     return default_interval;
   else
-    return MAX(default_interval,
-               (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+    return VPXMAX(default_interval,
+                  (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
   // Note this logic makes:
   // 4K24: 5
   // 4K30: 6
@@ -296,9 +305,9 @@ int vp9_rc_get_default_min_gf_interval(
 }
 
 int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
-  int interval = MIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  int interval = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
   interval += (interval & 0x01);  // Round to even value
-  return MAX(interval, min_gf_interval);
+  return VPXMAX(interval, min_gf_interval);
 }
 
 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
@@ -478,7 +487,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
   // More heavily damped adjustment used if we have been oscillating either side
   // of target.
   adjustment_limit = 0.25 +
-      0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
+      0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
 
   cpi->rc.q_2_frame = cpi->rc.q_1_frame;
   cpi->rc.q_1_frame = cm->base_qindex;
@@ -531,8 +540,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
   do {
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
         cm->seg.enabled &&
-        cpi->svc.temporal_layer_id == 0 &&
-        cpi->svc.spatial_layer_id == 0) {
+        cpi->svc.temporal_layer_id == 0) {
       bits_per_mb_at_this_q =
           (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
     } else {
@@ -558,8 +566,8 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
   if (cpi->oxcf.rc_mode == VPX_CBR &&
       (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
       cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
-    q = clamp(q, MIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
-              MAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+    q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+              VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
   }
   return q;
 }
@@ -617,7 +625,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
                                              : rc->last_q[INTER_FRAME] * 2;
     }
   }
-  return MIN(active_worst_quality, rc->worst_quality);
+  return VPXMIN(active_worst_quality, rc->worst_quality);
 }
 
 // Adjust active_worst_quality level based on buffer level.
@@ -644,10 +652,10 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   // So for first few frames following key, the qp of that key frame is weighted
   // into the active_worst_quality setting.
   ambient_qp = (cm->current_video_frame < num_frames_weight_key) ?
-      MIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) :
-      rc->avg_frame_qindex[INTER_FRAME];
-  active_worst_quality = MIN(rc->worst_quality,
-                             ambient_qp * 5 / 4);
+                   VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
+                          rc->avg_frame_qindex[KEY_FRAME]) :
+                   rc->avg_frame_qindex[INTER_FRAME];
+  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 / 4);
   if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
@@ -700,7 +708,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
                                             (last_boosted_q * 0.75),
                                             cm->bit_depth);
-      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else if (cm->current_video_frame > 0) {
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
@@ -833,7 +841,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
                                             last_boosted_q * 0.75,
                                             cm->bit_depth);
-      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
@@ -1002,21 +1010,21 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
       int qindex;
 
       if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-        qindex = MIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
         active_best_quality = qindex;
         last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
         delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
                                               last_boosted_q * 1.25,
                                               cm->bit_depth);
-        active_worst_quality = MIN(qindex + delta_qindex, active_worst_quality);
-
+        active_worst_quality =
+            VPXMIN(qindex + delta_qindex, active_worst_quality);
       } else {
         qindex = rc->last_boosted_qindex;
         last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
         delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
                                               last_boosted_q * 0.75,
                                               cm->bit_depth);
-        active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+        active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
       }
     } else {
       // Not forced keyframe.
@@ -1116,8 +1124,8 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
       (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
     int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
                                        active_worst_quality);
-    active_worst_quality = MAX(active_worst_quality + qdelta,
-                               active_best_quality);
+    active_worst_quality = VPXMAX(active_worst_quality + qdelta,
+                                  active_best_quality);
   }
 #endif
 
@@ -1126,7 +1134,8 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
     int qdelta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
                                             active_best_quality, 2.0,
                                             cm->bit_depth);
-    active_best_quality = MAX(active_best_quality + qdelta, rc->best_quality);
+    active_best_quality =
+        VPXMAX(active_best_quality + qdelta, rc->best_quality);
   }
 
   active_best_quality = clamp(active_best_quality,
@@ -1141,7 +1150,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
              rc->this_key_frame_forced) {
     // If static since last kf use better of last boosted and last kf q.
     if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-      q = MIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+      q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
     } else {
       q = rc->last_boosted_qindex;
     }
@@ -1203,9 +1212,9 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
     // For very small rate targets where the fractional adjustment
     // may be tiny make sure there is at least a minimum range.
     const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
-    *frame_under_shoot_limit = MAX(frame_target - tolerance - 200, 0);
-    *frame_over_shoot_limit = MIN(frame_target + tolerance + 200,
-                                  cpi->rc.max_frame_bandwidth);
+    *frame_under_shoot_limit = VPXMAX(frame_target - tolerance - 200, 0);
+    *frame_over_shoot_limit = VPXMIN(frame_target + tolerance + 200,
+                                     cpi->rc.max_frame_bandwidth);
   }
 }
 
@@ -1351,7 +1360,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (!cpi->use_svc) {
+  if (!cpi->use_svc || is_two_pass_svc(cpi)) {
     if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
         (cm->frame_type != KEY_FRAME))
       // Update the alternate reference frame stats as appropriate.
@@ -1458,7 +1467,8 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const SVC *const svc = &cpi->svc;
   const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
   const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
-  int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int min_frame_target =
+      VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
   int target;
 
   if (oxcf->gf_cbr_boost_pct) {
@@ -1480,23 +1490,24 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
             svc->temporal_layer_id, svc->number_temporal_layers);
     const LAYER_CONTEXT *lc = &svc->layer_context[layer];
     target = lc->avg_frame_size;
-    min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+    min_frame_target = VPXMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
   }
   if (diff > 0) {
     // Lower the target bandwidth for this frame.
-    const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
     target -= (target * pct_low) / 200;
   } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
-    const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    const int pct_high =
+        (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
   if (oxcf->rc_max_inter_bitrate_pct) {
     const int max_rate = rc->avg_frame_bandwidth *
                          oxcf->rc_max_inter_bitrate_pct / 100;
-    target = MIN(target, max_rate);
+    target = VPXMIN(target, max_rate);
   }
-  return MAX(min_frame_target, target);
+  return VPXMAX(min_frame_target, target);
 }
 
 static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
@@ -1518,7 +1529,7 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
       const LAYER_CONTEXT *lc = &svc->layer_context[layer];
       framerate = lc->framerate;
     }
-    kf_boost = MAX(kf_boost, (int)(2 * framerate - 16));
+    kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16));
     if (rc->frames_since_key <  framerate / 2) {
       kf_boost = (int)(kf_boost * rc->frames_since_key /
                        (framerate / 2));
@@ -1584,7 +1595,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
       cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
     } else if (is_one_pass_cbr_svc(cpi)) {
       LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
-      if (cpi->svc.spatial_layer_id == 0) {
+      if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) {
         lc->is_key_frame = 0;
       } else {
         lc->is_key_frame =
@@ -1726,7 +1737,7 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
     rc->max_gf_interval = rc->static_scene_max_gf_interval;
 
   // Clamp min to max
-  rc->min_gf_interval = MIN(rc->min_gf_interval, rc->max_gf_interval);
+  rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
 }
 
 void vp9_rc_update_framerate(VP9_COMP *cpi) {
@@ -1739,7 +1750,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth *
                                 oxcf->two_pass_vbrmin_section / 100);
 
-  rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+  rc->min_frame_bandwidth =
+      VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
 
   // A maximum bitrate for a frame is defined.
   // The baseline for this aligns with HW implementations that
@@ -1750,8 +1762,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   // specifies lossless encode.
   vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth *
                      oxcf->two_pass_vbrmax_section) / 100);
-  rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
-                                    vbr_max_bits);
+  rc->max_frame_bandwidth =
+      VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
 
   vp9_rc_set_gf_interval_range(cpi, rc);
 }
@@ -1789,12 +1801,12 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
   // Dont do it for kf,arf,gf or overlay frames.
   if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
       rc->vbr_bits_off_target_fast) {
-    int one_frame_bits = MAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
     int fast_extra_bits;
-    fast_extra_bits =
-      (int)MIN(rc->vbr_bits_off_target_fast, one_frame_bits);
-    fast_extra_bits = (int)MIN(fast_extra_bits,
-      MAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+    fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)VPXMIN(
+        fast_extra_bits,
+        VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
     *this_frame_target += (int)fast_extra_bits;
     rc->vbr_bits_off_target_fast -= fast_extra_bits;
   }
@@ -1804,6 +1816,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   int target_rate = rc->base_frame_target;
 
+  if (cpi->common.frame_type == KEY_FRAME)
+    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
+  else
+    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
     vbr_rate_correction(cpi, &target_rate);
@@ -1815,7 +1832,9 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
 int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  int resize_now = 0;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  int avg_qp_thr1 = 70;
+  int avg_qp_thr2 = 50;
   cpi->resize_scale_num = 1;
   cpi->resize_scale_den = 1;
   // Don't resize on key frame; reset the counters on key frame.
@@ -1824,10 +1843,19 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
     cpi->resize_count = 0;
     return 0;
   }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  // If denoiser is on, apply a smaller qp threshold.
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    avg_qp_thr1 = 60;
+    avg_qp_thr2 = 40;
+  }
+#endif
+
   // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
-  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
-    const int window = (int)(5 * cpi->framerate);
+  if (cpi->rc.frames_since_key > 1 * cpi->framerate) {
+    const int window = (int)(4 * cpi->framerate);
     cpi->resize_avg_qp += cm->base_qindex;
     if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
       ++cpi->resize_buffer_underflow;
@@ -1835,18 +1863,30 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
     // Check for resize action every "window" frames.
     if (cpi->resize_count >= window) {
       int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
-      // Resize down if buffer level has underflowed sufficent amount in past
-      // window, and we are at original resolution.
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original or 3/4 of original resolution.
       // Resize back up if average QP is low, and we are currently in a resized
-      // down state.
-      if (cpi->resize_state == 0 &&
-          cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
-        resize_now = 1;
-        cpi->resize_state = 1;
-      } else if (cpi->resize_state == 1 &&
-                 avg_qp < 40 * cpi->rc.worst_quality / 100) {
-        resize_now = -1;
-        cpi->resize_state = 0;
+      // down state, i.e. 1/2 or 3/4 of original resolution.
+      // Currently, use a flag to turn 3/4 resizing feature on/off.
+      if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
+        if (cpi->resize_state == THREE_QUARTER) {
+          resize_action = DOWN_ONEHALF;
+          cpi->resize_state = ONE_HALF;
+        } else if (cpi->resize_state == ORIG) {
+          resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR;
+          cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER;
+        }
+      } else if (cpi->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+        if (cpi->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100 ||
+            ONEHALFONLY_RESIZE) {
+          resize_action = UP_ORIG;
+          cpi->resize_state = ORIG;
+        } else if (cpi->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          cpi->resize_state = THREE_QUARTER;
+        }
       }
       // Reset for next window measurement.
       cpi->resize_avg_qp = 0;
@@ -1856,26 +1896,30 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   }
   // If decision is to resize, reset some quantities, and check is we should
   // reduce rate correction factor,
-  if (resize_now != 0) {
+  if (resize_action != NO_RESIZE) {
     int target_bits_per_frame;
     int active_worst_quality;
     int qindex;
     int tot_scale_change;
-    // For now, resize is by 1/2 x 1/2.
-    cpi->resize_scale_num = 1;
-    cpi->resize_scale_den = 2;
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      cpi->resize_scale_num = 3;
+      cpi->resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      cpi->resize_scale_num = 1;
+      cpi->resize_scale_den = 2;
+    } else {  // UP_ORIG or anything else
+      cpi->resize_scale_num = 1;
+      cpi->resize_scale_den = 1;
+    }
     tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
         (cpi->resize_scale_num * cpi->resize_scale_num);
     // Reset buffer level to optimal, update target size.
     rc->buffer_level = rc->optimal_buffer_level;
     rc->bits_off_target = rc->optimal_buffer_level;
     rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
-    // Reset cyclic refresh parameters.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
-      vp9_cyclic_refresh_reset_resize(cpi);
     // Get the projected qindex, based on the scaled target frame size (scaled
     // so target_bits_per_mb in vp9_rc_regulate_q will be correct target).
-    target_bits_per_frame = (resize_now == 1) ?
+    target_bits_per_frame = (resize_action >= 0) ?
         rc->this_frame_target * tot_scale_change :
         rc->this_frame_target / tot_scale_change;
     active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
@@ -1886,19 +1930,19 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
     // If resize is down, check if projected q index is close to worst_quality,
     // and if so, reduce the rate correction factor (since likely can afford
     // lower q for resized frame).
-    if (resize_now == 1 &&
+    if (resize_action > 0 &&
         qindex > 90 * cpi->rc.worst_quality / 100) {
       rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
     }
     // If resize is back up, check if projected q index is too much above the
     // current base_qindex, and if so, reduce the rate correction factor
     // (since prefer to keep q for resized frame at least close to previous q).
-    if (resize_now == -1 &&
+    if (resize_action < 0 &&
        qindex > 130 * cm->base_qindex / 100) {
       rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
     }
   }
-  return resize_now;
+  return resize_action;
 }
 
 // Compute average source sad (temporal sad: between current source and
@@ -1948,7 +1992,7 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
     // between current and the previous frame value(s). Use a minimum threshold
     // for cases where there is small change from content that is completely
     // static.
-    if (avg_sad > MAX(4000, (rc->avg_source_sad << 3)) &&
+    if (avg_sad > VPXMAX(4000, (rc->avg_source_sad << 3)) &&
         rc->frames_since_key > 1)
       rc->high_source_sad = 1;
     else
@@ -1968,16 +2012,59 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi,
   int thresh_rate = rc->avg_frame_bandwidth * 10;
   if (cm->base_qindex < thresh_qp &&
       frame_size > thresh_rate) {
+    double rate_correction_factor =
+        cpi->rc.rate_correction_factors[INTER_NORMAL];
+    const int target_size = cpi->rc.avg_frame_bandwidth;
+    double new_correction_factor;
+    int target_bits_per_mb;
+    double q2;
+    int enumerator;
     // Force a re-encode, and for now use max-QP.
     *q = cpi->rc.worst_quality;
-    // Adjust avg_frame_qindex and buffer_level, as these parameters will affect
-    // QP selection for subsequent frames. If they have settled down to a very
-    // different (low QP) state, then not re-adjusting them may cause next
-    // frame to select low QP and overshoot again.
-    // TODO(marpan): Check if rate correction factor should also be adjusted.
+    // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+    // these parameters will affect QP selection for subsequent frames. If they
+    // have settled down to a very different (low QP) state, then not adjusting
+    // them may cause next frame to select low QP and overshoot again.
     cpi->rc.avg_frame_qindex[INTER_FRAME] = *q;
     rc->buffer_level = rc->optimal_buffer_level;
     rc->bits_off_target = rc->optimal_buffer_level;
+    // Reset rate under/over-shoot flags.
+    cpi->rc.rc_1_frame = 0;
+    cpi->rc.rc_2_frame = 0;
+    // Adjust rate correction factor.
+    target_bits_per_mb = ((uint64_t)target_size << BPER_MB_NORMBITS) / cm->MBs;
+    // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
+    // This comes from the inverse computation of vp9_rc_bits_per_mb().
+    q2 = vp9_convert_qindex_to_q(*q, cm->bit_depth);
+    enumerator = 1800000;  // Factor for inter frame.
+    enumerator += (int)(enumerator * q2) >> 12;
+    new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
+    if (new_correction_factor > rate_correction_factor) {
+      rate_correction_factor =
+          VPXMIN(2.0 * rate_correction_factor, new_correction_factor);
+      if (rate_correction_factor > MAX_BPB_FACTOR)
+        rate_correction_factor = MAX_BPB_FACTOR;
+      cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+    }
+    // For temporal layers, reset the rate control parameters across all
+    // temporal layers.
+    if (cpi->use_svc) {
+      int i = 0;
+      SVC *svc = &cpi->svc;
+      for (i = 0; i < svc->number_temporal_layers; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *lc = &svc->layer_context[layer];
+        RATE_CONTROL *lrc = &lc->rc;
+        lrc->avg_frame_qindex[INTER_FRAME] = *q;
+        lrc->buffer_level = rc->optimal_buffer_level;
+        lrc->bits_off_target = rc->optimal_buffer_level;
+        lrc->rc_1_frame = 0;
+        lrc->rc_2_frame = 0;
+        lrc->rate_correction_factors[INTER_NORMAL] =
+            rate_correction_factor;
+      }
+    }
     return 1;
   } else {
     return 0;
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index 11dfa35c..136fd3e7 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -26,6 +26,7 @@ extern "C" {
 
 #define MIN_GF_INTERVAL     4
 #define MAX_GF_INTERVAL     16
+#define ONEHALFONLY_RESIZE  0
 
 typedef enum {
   INTER_NORMAL = 0,
@@ -43,6 +44,20 @@ typedef enum {
   FRAME_SCALE_STEPS
 } FRAME_SCALE_LEVEL;
 
+typedef enum {
+  NO_RESIZE = 0,
+  DOWN_THREEFOUR = 1,  // From orig to 3/4.
+  DOWN_ONEHALF = 2,    // From orig or 3/4 to 1/2.
+  UP_THREEFOUR = -1,   // From 1/2 to 3/4.
+  UP_ORIG = -2,        // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum {
+  ORIG = 0,
+  THREE_QUARTER = 1,
+  ONE_HALF = 2
+} RESIZE_STATE;
+
 // Frame dimensions multiplier wrt the native frame size, in 1/16ths,
 // specified for the scale-up case.
 // e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c
index 2f2f7c1b..b085c7a0 100644
--- a/libvpx/vp9/encoder/vp9_rd.c
+++ b/libvpx/vp9/encoder/vp9_rd.c
@@ -14,6 +14,7 @@
 
 #include "./vp9_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/bitops.h"
 #include "vpx_ports/mem.h"
@@ -172,7 +173,7 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
-    const int boost_index = MIN(15, (cpi->rc.gfu_boost / 100));
+    const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100));
 
     rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
@@ -204,7 +205,7 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
   q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(debargha): Adjust the function below.
-  return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+  return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
@@ -404,7 +405,7 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
     static const uint32_t MAX_XSQ_Q10 = 245727;
     const uint64_t xsq_q10_64 =
         (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
-    const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
+    const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
     *rate = ((r_q10 << n_log2) + 2) >> 2;
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
@@ -485,7 +486,7 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
       continue;
     fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
     fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
-    max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+    max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
 
     if (fp_row ==0 && fp_col == 0 && zero_seen)
       continue;
@@ -629,16 +630,15 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
-      const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
-      const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+      const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
+      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
       BLOCK_SIZE bs;
       for (bs = min_size; bs <= max_size; ++bs) {
         int *const fact = &factor_buf[bs][mode];
         if (mode == best_mode_index) {
           *fact -= (*fact >> 4);
         } else {
-          *fact = MIN(*fact + RD_THRESH_INC,
-                      rd_thresh * RD_THRESH_MAX_FACT);
+          *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
         }
       }
     }
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index 96c64744..4f3a06e9 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -14,6 +14,7 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
@@ -192,8 +193,8 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
     const int64_t ac_thr = p->quant_thred[1] >> shift;
     // The low thresholds are used to measure if the prediction errors are
     // low enough so that we can skip the mode search.
-    const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
-    const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
+    const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
+    const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
     int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
     int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
     int idx, idy;
@@ -268,57 +269,79 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum = dist_sum << 4;
 }
 
-int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
-                          intptr_t block_size, int64_t *ssz) {
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff,
+                                 intptr_t block_size,
+                                 int64_t *ssz, int bd) {
   int i;
   int64_t error = 0, sqcoeff = 0;
+  int shift = 2 * (bd - 8);
+  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
 
   for (i = 0; i < block_size; i++) {
-    const int diff = coeff[i] - dqcoeff[i];
+    const int64_t diff = coeff[i] - dqcoeff[i];
     error +=  diff * diff;
-    sqcoeff += coeff[i] * coeff[i];
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
   }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
 
   *ssz = sqcoeff;
   return error;
 }
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
-                             int block_size) {
+int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
+                                      const tran_low_t *dqcoeff,
+                                      intptr_t block_size,
+                                      int64_t *ssz) {
+  // Note that the C versions of these 2 functions (vp9_block_error and
+  // vp9_highbd_block_error_8bit) are the same, but the optimized assembly
+  // routines are not compatible in the non high bitdepth configuration, so
+  // they still cannot share the same name.
+  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
+}
+
+static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
+                                               const tran_low_t *dqcoeff,
+                                               intptr_t block_size,
+                                               int64_t *ssz, int bd) {
+  if (bd == 8) {
+    return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+  } else {
+    return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                          intptr_t block_size, int64_t *ssz) {
   int i;
-  int64_t error = 0;
+  int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
     const int diff = coeff[i] - dqcoeff[i];
     error +=  diff * diff;
+    sqcoeff += coeff[i] * coeff[i];
   }
 
+  *ssz = sqcoeff;
   return error;
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
-                                 const tran_low_t *dqcoeff,
-                                 intptr_t block_size,
-                                 int64_t *ssz, int bd) {
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
   int i;
-  int64_t error = 0, sqcoeff = 0;
-  int shift = 2 * (bd - 8);
-  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  int64_t error = 0;
 
   for (i = 0; i < block_size; i++) {
-    const int64_t diff = coeff[i] - dqcoeff[i];
+    const int diff = coeff[i] - dqcoeff[i];
     error +=  diff * diff;
-    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
   }
-  assert(error >= 0 && sqcoeff >= 0);
-  error = (error + rounding) >> shift;
-  sqcoeff = (sqcoeff + rounding) >> shift;
 
-  *ssz = sqcoeff;
   return error;
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
@@ -340,8 +363,7 @@ static int cost_coeffs(MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const struct macroblock_plane *p = &x->plane[plane];
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const PLANE_TYPE type = pd->plane_type;
+  const PLANE_TYPE type = get_plane_type(plane);
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -357,8 +379,8 @@ static int cost_coeffs(MACROBLOCK *x,
 #endif
 
   // Check for consistency of tx_size with mode info
-  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
-                              : get_uv_tx_size(mbmi, pd) == tx_size);
+  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size :
+         get_uv_tx_size(mbmi, &xd->plane[plane]) == tx_size);
 
   if (eob == 0) {
     // single eob token
@@ -430,8 +452,9 @@ static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 #if CONFIG_VP9_HIGHBITDEPTH
   const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
-  *out_dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                     &this_sse, bd) >> shift;
+  *out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,
+                                              16 << ss_txfrm_size,
+                                              &this_sse, bd) >> shift;
 #else
   *out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                               &this_sse) >> shift;
@@ -505,7 +528,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
         if (tx_size != TX_32X32)
           dc_correct >>= 2;
 
-        dist = MAX(0, sse - dc_correct);
+        dist = VPXMAX(0, sse - dc_correct);
       }
     } else {
       // SKIP_TXFM_AC_DC
@@ -531,7 +554,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
 
   // TODO(jingning): temporarily enabled only for luma component
-  rd = MIN(rd1, rd2);
+  rd = VPXMIN(rd1, rd2);
   if (plane == 0)
     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
                                     (rd1 > rd2 && !xd->lossless);
@@ -569,7 +592,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
 
   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
-  args.so = get_scan(xd, tx_size, pd->plane_type, 0);
+  args.so = get_scan(xd, tx_size, get_plane_type(plane), 0);
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
                                          block_rd_txfm, &args);
@@ -597,7 +620,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
-  mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
+  mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
 
   txfm_rd_in_plane(x, rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
@@ -637,8 +660,8 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
     start_tx = max_tx_size;
     end_tx = 0;
   } else {
-    TX_SIZE chosen_tx_size = MIN(max_tx_size,
-                                 tx_mode_to_biggest_tx_size[cm->tx_mode]);
+    TX_SIZE chosen_tx_size = VPXMIN(max_tx_size,
+                                    tx_mode_to_biggest_tx_size[cm->tx_mode]);
     start_tx = chosen_tx_size;
     end_tx = chosen_tx_size;
   }
@@ -663,6 +686,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
     } else if (s[n]) {
       if (is_inter_block(mbmi)) {
         rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+        r[n][1] -= r_tx_size;
       } else {
         rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
         rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
@@ -672,6 +696,11 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
+    if (is_inter_block(mbmi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
+      rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+      rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+    }
+
     // Early termination in transform size search.
     if (cpi->sf.tx_size_search_breakout &&
         (rd[n][1] == INT64_MAX ||
@@ -825,7 +854,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                                  so->scan, so->neighbors,
                                  cpi->sf.use_fast_coef_costing);
-            distortion += vp9_highbd_block_error(
+            distortion += vp9_highbd_block_error_dispatch(
                 coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                 16, &unused, xd->bd) >> 2;
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -923,8 +952,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                              so->scan, so->neighbors,
                              cpi->sf.use_fast_coef_costing);
+#if CONFIG_VP9_HIGHBITDEPTH
+          distortion += vp9_highbd_block_error_8bit(
+              coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2;
+#else
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >> 2;
+#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1362,6 +1396,9 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   k = i;
   for (idy = 0; idy < height / 4; ++idy) {
     for (idx = 0; idx < width / 4; ++idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+#endif
       int64_t ssz, rd, rd1, rd2;
       tran_low_t* coeff;
 
@@ -1371,14 +1408,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                     coeff, 8);
       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        thisdistortion += vp9_highbd_block_error(coeff,
-                                                 BLOCK_OFFSET(pd->dqcoeff, k),
-                                                 16, &ssz, xd->bd);
-      } else {
-        thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
-                                          16, &ssz);
-      }
+      thisdistortion += vp9_highbd_block_error_dispatch(
+          coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
 #else
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
@@ -1389,7 +1420,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                               cpi->sf.use_fast_coef_costing);
       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
-      rd = MIN(rd1, rd2);
+      rd = VPXMIN(rd1, rd2);
       if (rd >= best_yrd)
         return INT64_MAX;
     }
@@ -1808,7 +1839,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
           if (i == 0)
             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
           else
-            max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
+            max_mv =
+                VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
 
           if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
             // Take wtd average of the step_params based on the last frame's
@@ -1826,7 +1858,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
           if (cpi->sf.adaptive_motion_search) {
             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
-            step_param = MAX(step_param, 8);
+            step_param = VPXMAX(step_param, 8);
           }
 
           // adjust src pointer for this block
@@ -2231,7 +2263,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   vp9_set_mv_search_range(x, &ref_mv);
 
   // Work out the size of the first step in the mv step search.
-  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+  // 0 here is maximum length first step. 1 is VPXMAX >> 1 etc.
   if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
     // Take wtd average of the step_params based on the last frame's
     // max mv magnitude and that based on the best ref mvs of the current
@@ -2243,9 +2275,10 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
-    int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
-          MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
-    step_param = MAX(step_param, boffset);
+    int boffset =
+        2 * (b_width_log2_lookup[BLOCK_64X64] -
+             VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = VPXMAX(step_param, boffset);
   }
 
   if (cpi->sf.adaptive_motion_search) {
@@ -2466,7 +2499,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       // motion field, where the distortion gain for a single block may not
       // be enough to overcome the cost of a new mv.
       if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
-        *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+        *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
       } else {
         *rate2 += rate_mv;
       }
@@ -2502,10 +2535,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // initiation of a motion field.
   if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
                           mode_mv, refs[0])) {
-    *rate2 += MIN(cost_mv_ref(cpi, this_mode,
-                              mbmi_ext->mode_context[refs[0]]),
-                  cost_mv_ref(cpi, NEARESTMV,
-                              mbmi_ext->mode_context[refs[0]]));
+    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode,
+                                 mbmi_ext->mode_context[refs[0]]),
+                     cost_mv_ref(cpi, NEARESTMV,
+                                 mbmi_ext->mode_context[refs[0]]));
   } else {
     *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
   }
@@ -2547,10 +2580,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
           filter_cache[i] = rd;
           filter_cache[SWITCHABLE_FILTERS] =
-              MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
           if (cm->interp_filter == SWITCHABLE)
             rd += rs_rd;
-          *mask_filter = MAX(*mask_filter, rd);
+          *mask_filter = VPXMAX(*mask_filter, rd);
         } else {
           int rate_sum = 0;
           int64_t dist_sum = 0;
@@ -2580,10 +2613,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
           filter_cache[i] = rd;
           filter_cache[SWITCHABLE_FILTERS] =
-              MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
           if (cm->interp_filter == SWITCHABLE)
             rd += rs_rd;
-          *mask_filter = MAX(*mask_filter, rd);
+          *mask_filter = VPXMAX(*mask_filter, rd);
 
           if (i == 0 && intpel_mv) {
             tmp_rate_sum = rate_sum;
@@ -2694,7 +2727,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     *distortion += distortion_y;
 
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
-    rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
+    rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
     if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
                           &sseuv, bsize, ref_best_rd - rdcosty)) {
@@ -2759,7 +2792,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                        pd[1].subsampling_x,
                                        pd[1].subsampling_y);
   rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip, MAX(BLOCK_8X8, bsize),
+                          &dist_uv, &uv_skip, VPXMAX(BLOCK_8X8, bsize),
                           max_uv_tx_size);
 
   if (y_skip && uv_skip) {
@@ -2826,12 +2859,12 @@ static void rd_variance_adjustment(VP9_COMP *cpi,
   // to a predictor with a low spatial complexity compared to the source.
   if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
       (source_variance > recon_variance)) {
-    var_factor = MIN(absvar_diff, MIN(VLOW_ADJ_MAX, var_error));
+    var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error));
   // A second possible case of interest is where the source variance
   // is very low and we wish to discourage false texture or motion trails.
   } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
              (recon_variance > source_variance)) {
-    var_factor = MIN(absvar_diff, MIN(VHIGH_ADJ_MAX, var_error));
+    var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error));
   }
   *this_rd += (*this_rd * var_factor) / 100;
 }
@@ -2861,7 +2894,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
     top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
 
     bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-    bottom_edge = MAX(top_edge, bottom_edge);
+    bottom_edge = VPXMAX(top_edge, bottom_edge);
   }
 
   if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
@@ -2888,7 +2921,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
     left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
 
     right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
-    right_edge = MAX(left_edge, right_edge);
+    right_edge = VPXMAX(left_edge, right_edge);
   }
 
   if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
@@ -3135,7 +3168,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
     }
 
     if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
-        (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
+        (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
       continue;
 
     if (mode_skip_mask[ref_frame] & (1 << this_mode))
@@ -3149,10 +3182,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
       continue;
 
     if (sf->motion_field_mode_search) {
-      const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
-                                tile_info->mi_col_end - mi_col);
-      const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
-                                tile_info->mi_row_end - mi_row);
+      const int mi_width  = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
+                                   tile_info->mi_col_end - mi_col);
+      const int mi_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
+                                   tile_info->mi_row_end - mi_row);
       const int bsl = mi_width_log2_lookup[bsize];
       int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
           + get_chessboard_index(cm->current_video_frame)) & 0x1;
@@ -3370,9 +3403,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+        best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -3471,7 +3504,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
             adj_rd = filter_cache[i] - ref;
 
           adj_rd += this_rd;
-          best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+          best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
         }
       }
     }
@@ -3783,6 +3816,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     int this_skip2 = 0;
     int64_t total_sse = INT_MAX;
     int early_term = 0;
+    struct buf_2d backup_yv12[2][MAX_MB_PLANE];
 
     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
@@ -3814,7 +3848,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     }
 
     if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
-        (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
+        (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
       continue;
 
     // Test best rd so far against threshold for trying this mode.
@@ -3840,16 +3874,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         continue;
     }
 
-    // TODO(jingning, jkoleszar): scaling reference frame not supported for
-    // sub8x8 blocks.
-    if (ref_frame > INTRA_FRAME &&
-        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
-      continue;
-
-    if (second_ref_frame > INTRA_FRAME &&
-        vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
-      continue;
-
     if (comp_pred)
       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
     else if (ref_frame != INTRA_FRAME)
@@ -3928,6 +3952,25 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       int pred_exists = 0;
       int uv_skippable;
 
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
+      int ref;
+
+      for (ref = 0; ref < 2; ++ref) {
+        scaled_ref_frame[ref] = mbmi->ref_frame[ref] > INTRA_FRAME ?
+            vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[ref]) : NULL;
+
+        if (scaled_ref_frame[ref]) {
+          int i;
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[ref][i] = xd->plane[i].pre[ref];
+          vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                               NULL);
+        }
+      }
+
       this_rd_thresh = (ref_frame == LAST_FRAME) ?
           rd_opt->threshes[segment_id][bsize][THR_LAST] :
           rd_opt->threshes[segment_id][bsize][THR_ALTR];
@@ -3969,12 +4012,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
             filter_cache[switchable_filter_index] = tmp_rd;
             filter_cache[SWITCHABLE_FILTERS] =
-                MIN(filter_cache[SWITCHABLE_FILTERS],
-                    tmp_rd + rs_rd);
+                VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
             if (cm->interp_filter == SWITCHABLE)
               tmp_rd += rs_rd;
 
-            mask_filter = MAX(mask_filter, tmp_rd);
+            mask_filter = VPXMAX(mask_filter, tmp_rd);
 
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
@@ -4051,9 +4093,9 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
 
       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
 
-      tmp_best_rdu = best_rd -
-          MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
-              RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+      tmp_best_rdu =
+          best_rd - VPXMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+                           RDCOST(x->rdmult, x->rddiv, 0, total_sse));
 
       if (tmp_best_rdu > 0) {
         // If even the 'Y' rd value of split is higher than best so far
@@ -4062,14 +4104,31 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
                                         BLOCK_8X8);
         memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
         if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu)) {
+          for (ref = 0; ref < 2; ++ref) {
+            if (scaled_ref_frame[ref]) {
+              int i;
+              for (i = 0; i < MAX_MB_PLANE; ++i)
+                xd->plane[i].pre[ref] = backup_yv12[ref][i];
+            }
+          }
           continue;
+        }
 
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
         total_sse += uv_sse;
       }
+
+      for (ref = 0; ref < 2; ++ref) {
+        if (scaled_ref_frame[ref]) {
+          // Restore the prediction frame pointers to their unscaled versions.
+          int i;
+          for (i = 0; i < MAX_MB_PLANE; ++i)
+            xd->plane[i].pre[ref] = backup_yv12[ref][i];
+        }
+      }
     }
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -4113,9 +4172,9 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+        best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -4214,7 +4273,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
           adj_rd = filter_cache[i] - ref;
 
         adj_rd += this_rd;
-        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
       }
     }
 
diff --git a/libvpx/vp9/encoder/vp9_resize.h b/libvpx/vp9/encoder/vp9_resize.h
index 067af53f..b5feb386 100644
--- a/libvpx/vp9/encoder/vp9_resize.h
+++ b/libvpx/vp9/encoder/vp9_resize.h
@@ -14,6 +14,10 @@
 #include <stdio.h>
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vp9_resize_plane(const uint8_t *const input,
                       int height,
                       int width,
@@ -121,4 +125,9 @@ void vp9_highbd_resize_frame444(const uint8_t *const y,
                                 int owidth,
                                 int bd);
 #endif    // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif    // VP9_ENCODER_VP9_RESIZE_H_
diff --git a/libvpx/vp9/encoder/vp9_skin_detection.c b/libvpx/vp9/encoder/vp9_skin_detection.c
index aaa8ea07..c2763b7d 100644
--- a/libvpx/vp9/encoder/vp9_skin_detection.c
+++ b/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -98,12 +98,13 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
       uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
       uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
       uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
+      int is_skin = 0;
       if (mode_filter == 1) {
         ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
         usource = (usource + usource2 + usource3 + usource4) >> 2;
         vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
       }
-      const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+      is_skin = vp9_skin_pixel(ysource, usource, vsource);
       for (i = 0; i < y_bsize; i++) {
         for (j = 0; j < y_bsize; j++) {
           if (is_skin)
diff --git a/libvpx/vp9/encoder/vp9_skin_detection.h b/libvpx/vp9/encoder/vp9_skin_detection.h
index 3d4e7375..0a87ef9f 100644
--- a/libvpx/vp9/encoder/vp9_skin_detection.h
+++ b/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -25,7 +25,8 @@ int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
 
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
-void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
 #endif
 
 #ifdef __cplusplus
diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c
index 5e72c4cb..a5396298 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/libvpx/vp9/encoder/vp9_speed_features.c
@@ -13,6 +13,7 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_rdopt.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 
 // Intra only frames, golden frames (except alt ref overlays) and
@@ -49,7 +50,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
 
   if (speed >= 1) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
       sf->partition_search_breakout_dist_thr = (1 << 23);
@@ -60,7 +61,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 2) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
       sf->adaptive_pred_interp_filter = 0;
@@ -75,7 +76,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 3) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
       sf->partition_search_breakout_dist_thr = (1 << 25);
@@ -99,7 +100,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 4) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->partition_search_breakout_dist_thr = (1 << 26);
     } else {
       sf->partition_search_breakout_dist_thr = (1 << 24);
@@ -112,8 +113,14 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
                                    SPEED_FEATURES *sf, int speed) {
   const int boosted = frame_is_boosted(cpi);
 
+  sf->partition_search_breakout_dist_thr = (1 << 20);
+  sf->partition_search_breakout_rate_thr = 80;
+  sf->tx_size_search_breakout = 1;
   sf->adaptive_rd_thresh = 1;
   sf->allow_skip_recode = 1;
+  sf->less_rectangular_check = 1;
+  sf->use_square_partition_only = !frame_is_boosted(cpi);
+  sf->use_square_only_threshold = BLOCK_16X16;
 
   if (speed >= 1) {
     if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
@@ -122,6 +129,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     } else {
       sf->use_square_partition_only = !frame_is_intra_only(cm);
     }
+    sf->use_square_only_threshold = BLOCK_4X4;
 
     sf->less_rectangular_check  = 1;
 
@@ -138,9 +146,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-
-    sf->tx_size_search_breakout = 1;
-    sf->partition_search_breakout_rate_thr = 80;
   }
 
   if (speed >= 2) {
@@ -215,7 +220,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
 
   if (speed >= 1) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
     } else {
@@ -224,7 +229,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 2) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
     } else {
@@ -233,7 +238,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 5) {
-    if (MIN(cm->width, cm->height) >= 720) {
+    if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->partition_search_breakout_dist_thr = (1 << 25);
     } else {
       sf->partition_search_breakout_dist_thr = (1 << 23);
@@ -241,7 +246,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 7) {
-    sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
+    sf->encode_breakout_thresh = (VPXMIN(cm->width, cm->height) >= 720) ?
         800 : 300;
   }
 }
@@ -381,7 +386,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   }
 
   if (speed >= 6) {
-    // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
     sf->partition_search_type = VAR_BASED_PARTITION;
     // Turn on this to use non-RD key frame coding mode.
     sf->use_nonrd_pick_mode = 1;
@@ -471,6 +475,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
+  sf->use_square_only_threshold = BLOCK_SIZES;
   sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->rd_auto_partition_min_limit = BLOCK_4X4;
   sf->default_max_partition_size = BLOCK_64X64;
diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h
index 95038cee..575e98cf 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/libvpx/vp9/encoder/vp9_speed_features.h
@@ -267,6 +267,7 @@ typedef struct SPEED_FEATURES {
 
   // Disable testing non square partitions. (eg 16x32)
   int use_square_partition_only;
+  BLOCK_SIZE use_square_only_threshold;
 
   // Sets min and max partition sizes for this 64x64 region based on the
   // same 64x64 in last encoded frame, and the left and above neighbor.
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c
index e69404ad..8a6818c8 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -10,9 +10,11 @@
 
 #include <math.h>
 
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_extend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 #define SMALL_FRAME_FB_IDX 7
 #define SMALL_FRAME_WIDTH  32
@@ -21,11 +23,14 @@
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  int mi_rows = cpi->common.mi_rows;
+  int mi_cols = cpi->common.mi_cols;
   int sl, tl;
   int alt_ref_idx = svc->number_spatial_layers;
 
   svc->spatial_layer_id = 0;
   svc->temporal_layer_id = 0;
+  svc->first_spatial_layer_to_encode = 0;
 
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
@@ -93,6 +98,26 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
       lrc->buffer_level = oxcf->starting_buffer_level_ms *
                               lc->target_bandwidth / 1000;
       lrc->bits_off_target = lrc->buffer_level;
+
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., ss_number_layers > 1), these need to be updated per spatial
+      // layer.
+      // Cyclic refresh is only applied on base temporal layer.
+      if (oxcf->ss_number_layers > 1 &&
+          tl == 0) {
+        size_t last_coded_q_map_size;
+        size_t consec_zero_mv_size;
+        lc->sb_index = 0;
+        lc->map = vpx_malloc(mi_rows * mi_cols * sizeof(signed char));
+        memset(lc->map, 0, mi_rows * mi_cols);
+        last_coded_q_map_size = mi_rows * mi_cols * sizeof(uint8_t);
+        lc->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
+        assert(MAXQ <= 255);
+        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
+        consec_zero_mv_size = mi_rows * mi_cols * sizeof(uint8_t);
+        lc->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
+        memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
+       }
     }
   }
 
@@ -113,8 +138,6 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
 
   if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
     for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
-      spatial_layer_target = 0;
-
       for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
         layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
         svc->layer_context[layer].target_bandwidth =
@@ -141,8 +164,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
         lrc->maximum_buffer_size =
             (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
         lrc->bits_off_target =
-            MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
-        lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
+            VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+        lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
         lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
         lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
         lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
@@ -173,9 +196,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
           (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
       lrc->maximum_buffer_size =
           (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
-      lrc->bits_off_target = MIN(lrc->bits_off_target,
-                                 lrc->maximum_buffer_size);
-      lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
+      lrc->bits_off_target = VPXMIN(lrc->bits_off_target,
+                                    lrc->maximum_buffer_size);
+      lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
       // Update framerate-related quantities.
       if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
         lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
@@ -258,6 +281,24 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
+
+  // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+  // for the base temporal layer.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->svc.number_spatial_layers > 1 &&
+      cpi->svc.temporal_layer_id == 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    signed char *temp = cr->map;
+    uint8_t *temp2 = cr->last_coded_q_map;
+    uint8_t *temp3 = cr->consec_zero_mv;
+    cr->map = lc->map;
+    lc->map = temp;
+    cr->last_coded_q_map = lc->last_coded_q_map;
+    lc->last_coded_q_map = temp2;
+    cr->consec_zero_mv = lc->consec_zero_mv;
+    lc->consec_zero_mv = temp3;
+    cr->sb_index = lc->sb_index;
+  }
 }
 
 void vp9_save_layer_context(VP9_COMP *const cpi) {
@@ -268,6 +309,24 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
   lc->twopass = cpi->twopass;
   lc->target_bandwidth = (int)oxcf->target_bandwidth;
   lc->alt_ref_source = cpi->alt_ref_source;
+
+  // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+  // for the base temporal layer.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->svc.number_spatial_layers > 1 &&
+      cpi->svc.temporal_layer_id == 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    signed char *temp = lc->map;
+    uint8_t *temp2 = lc->last_coded_q_map;
+    uint8_t *temp3 = lc->consec_zero_mv;
+    lc->map = cr->map;
+    cr->map = temp;
+    lc->last_coded_q_map = cr->last_coded_q_map;
+    cr->last_coded_q_map = temp2;
+    lc->consec_zero_mv = cr->consec_zero_mv;
+    cr->consec_zero_mv = temp3;
+    lc->sb_index = cr->sb_index;
+  }
 }
 
 void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
@@ -492,19 +551,35 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
     set_flags_and_fb_idx_for_temporal_mode2(cpi);
   } else if (cpi->svc.temporal_layering_mode ==
       VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
-    // VP9E_TEMPORAL_LAYERING_MODE_BYPASS :
-    // if the code goes here, it means the encoder will be relying on the
-    // flags from outside for layering.
-    // However, since when spatial+temporal layering is used, the buffer indices
-    // cannot be derived automatically, the bypass mode will only work when the
-    // number of spatial layers equals 1.
-    assert(cpi->svc.number_spatial_layers == 1);
+    // In the BYPASS/flexible mode, the encoder is relying on the application
+    // to specify, for each spatial layer, the flags and buffer indices for the
+    // layering.
+    // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is
+    // needed to support the case where the frame flags may be passed in via
+    // vpx_codec_encode(), which can be used for the temporal-only svc case.
+    if (cpi->ext_refresh_frame_flags_pending == 0) {
+      int sl;
+      cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+      sl = cpi->svc.spatial_layer_id;
+      vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]);
+      cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl];
+      cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl];
+      cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl];
+    }
   }
 
   lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
                                cpi->svc.number_temporal_layers +
                                cpi->svc.temporal_layer_id];
 
+  // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS,
+  // only for non-BYPASS mode for now.
+  if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+    RATE_CONTROL *const lrc = &lc->rc;
+    lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
+    lrc->best_quality =  vp9_quantizer_to_qindex(lc->min_q);
+  }
+
   get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
                        lc->scaling_factor_num, lc->scaling_factor_den,
                        &width, &height);
@@ -643,3 +718,21 @@ struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
   }
   return buf;
 }
+
+// Free each layer's cyclic refresh buffers; vpx_free(NULL) is a no-op.
+void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) {
+  int sl, tl;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      const int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+      LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer];
+      vpx_free(lc->map);
+      lc->map = NULL;  // cleared so a repeated call cannot double free
+      vpx_free(lc->last_coded_q_map);
+      lc->last_coded_q_map = NULL;
+      vpx_free(lc->consec_zero_mv);
+      lc->consec_zero_mv = NULL;
+    }
+  }
+}
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h
index b6a5ea54..694b5abd 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -41,6 +41,11 @@ typedef struct {
   int has_alt_frame;
   size_t layer_size;
   struct vpx_psnr_pkt psnr_pkt;
+  // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+  int sb_index;
+  signed char *map;
+  uint8_t *last_coded_q_map;
+  uint8_t *consec_zero_mv;
 } LAYER_CONTEXT;
 
 typedef struct {
@@ -50,6 +55,7 @@ typedef struct {
   int number_temporal_layers;
 
   int spatial_layer_to_encode;
+  int first_spatial_layer_to_encode;
 
   // Workaround for multiple frame contexts
   enum {
@@ -70,6 +76,12 @@ typedef struct {
   // Indicates what sort of temporal layering is used.
   // Currently, this only works for CBR mode.
   VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+  // Frame flags and buffer indexes for each spatial layer, set by the
+  // application (external settings).
+  int ext_frame_flags[VPX_MAX_LAYERS];
+  int ext_lst_fb_idx[VPX_MAX_LAYERS];
+  int ext_gld_fb_idx[VPX_MAX_LAYERS];
+  int ext_alt_fb_idx[VPX_MAX_LAYERS];
 } SVC;
 
 struct VP9_COMP;
@@ -115,6 +127,8 @@ int vp9_svc_start_frame(struct VP9_COMP *const cpi);
 
 int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
 
+void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 439eac6b..16f9c857 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -23,6 +23,7 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
@@ -216,7 +217,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               int stride) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS old_search_method = mv_sf->search_method;
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
@@ -242,12 +244,13 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   xd->plane[0].pre[0].stride = stride;
 
   step_param = mv_sf->reduce_first_step_size;
-  step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2);
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
-  // Ignore mv costing by sending NULL pointer instead of cost arrays
-  vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                 cond_cost_list(cpi, cost_list),
-                 &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
+  mv_sf->search_method = HEX;
+  vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+                        sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
+                        ref_mv, 0, 0);
+  mv_sf->search_method = old_search_method;
 
   // Ignore mv costing by sending NULL pointer instead of cost array
   bestsme = cpi->find_fractional_mv_step(x, ref_mv,
@@ -718,7 +721,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
                                "Failed to reallocate alt_ref_buffer");
           }
           frames[frame] = vp9_scale_if_required(
-              cm, frames[frame], &cpi->svc.scaled_frames[frame_used]);
+              cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0);
           ++frame_used;
         }
       }
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index 85cb2fce..6076e2a6 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -66,14 +66,6 @@ const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
 };
 
-static const vpx_tree_index cat1[2] = {0, 0};
-static const vpx_tree_index cat2[4] = {2, 2, 0, 0};
-static const vpx_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
-static const vpx_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vpx_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vpx_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
-    14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
-
 static const int16_t zero_cost[] = {0};
 static const int16_t one_cost[] = {255, 257};
 static const int16_t two_cost[] = {255, 257};
@@ -366,68 +358,49 @@ const int16_t vp9_cat6_high12_high_cost[2048] = {
 };
 #endif
 
-#if CONFIG_VP9_HIGHBITDEPTH
-static const vpx_tree_index cat1_high10[2] = {0, 0};
-static const vpx_tree_index cat2_high10[4] = {2, 2, 0, 0};
-static const vpx_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
-static const vpx_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vpx_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vpx_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
-  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
-  30, 30, 0, 0};
-static const vpx_tree_index cat1_high12[2] = {0, 0};
-static const vpx_tree_index cat2_high12[4] = {2, 2, 0, 0};
-static const vpx_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
-static const vpx_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vpx_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vpx_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
-  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
-  30, 30, 32, 32, 34, 34, 0, 0};
-#endif
-
 const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0, zero_cost},                             // ZERO_TOKEN
-  {0, 0, 0, 1, one_cost},                              // ONE_TOKEN
-  {0, 0, 0, 2, two_cost},                              // TWO_TOKEN
-  {0, 0, 0, 3, three_cost},                            // THREE_TOKEN
-  {0, 0, 0, 4, four_cost},                             // FOUR_TOKEN
-  {cat1, vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
-  {cat2, vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
-  {cat3, vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
-  {cat4, vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CATEGORY4_TOKEN
-  {cat5, vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CATEGORY5_TOKEN
-  {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0},          // CATEGORY6_TOKEN
-  {0, 0, 0, 0, zero_cost}                              // EOB_TOKEN
+  {0, 0, 0, zero_cost},                          // ZERO_TOKEN
+  {0, 0, 1, one_cost},                           // ONE_TOKEN
+  {0, 0, 2, two_cost},                           // TWO_TOKEN
+  {0, 0, 3, three_cost},                         // THREE_TOKEN
+  {0, 0, 4, four_cost},                          // FOUR_TOKEN
+  {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
+  {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
+  {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
+  {vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CATEGORY4_TOKEN
+  {vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CATEGORY5_TOKEN
+  {vp9_cat6_prob, 14, CAT6_MIN_VAL, 0},          // CATEGORY6_TOKEN
+  {0, 0, 0, zero_cost}                           // EOB_TOKEN
 };
 
 #if CONFIG_VP9_HIGHBITDEPTH
 const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0, zero_cost},                                           // ZERO
-  {0, 0, 0, 1, one_cost},                                            // ONE
-  {0, 0, 0, 2, two_cost},                                            // TWO
-  {0, 0, 0, 3, three_cost},                                          // THREE
-  {0, 0, 0, 4, four_cost},                                           // FOUR
-  {cat1_high10, vp9_cat1_prob_high10, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
-  {cat2_high10, vp9_cat2_prob_high10, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
-  {cat3_high10, vp9_cat3_prob_high10, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
-  {cat4_high10, vp9_cat4_prob_high10, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
-  {cat5_high10, vp9_cat5_prob_high10, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
-  {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0},          // CAT6
-  {0, 0, 0, 0, zero_cost}                                            // EOB
+  {0, 0, 0, zero_cost},                                 // ZERO
+  {0, 0, 1, one_cost},                                  // ONE
+  {0, 0, 2, two_cost},                                  // TWO
+  {0, 0, 3, three_cost},                                // THREE
+  {0, 0, 4, four_cost},                                 // FOUR
+  {vp9_cat1_prob_high10, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {vp9_cat2_prob_high10, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {vp9_cat3_prob_high10, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {vp9_cat4_prob_high10, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {vp9_cat5_prob_high10, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, zero_cost}                                  // EOB
 };
 const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0, zero_cost},                                           // ZERO
-  {0, 0, 0, 1, one_cost},                                            // ONE
-  {0, 0, 0, 2, two_cost},                                            // TWO
-  {0, 0, 0, 3, three_cost},                                          // THREE
-  {0, 0, 0, 4, four_cost},                                           // FOUR
-  {cat1_high12, vp9_cat1_prob_high12, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
-  {cat2_high12, vp9_cat2_prob_high12, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
-  {cat3_high12, vp9_cat3_prob_high12, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
-  {cat4_high12, vp9_cat4_prob_high12, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
-  {cat5_high12, vp9_cat5_prob_high12, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
-  {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},          // CAT6
-  {0, 0, 0, 0, zero_cost}                                            // EOB
+  {0, 0, 0, zero_cost},                                 // ZERO
+  {0, 0, 1, one_cost},                                  // ONE
+  {0, 0, 2, two_cost},                                  // TWO
+  {0, 0, 3, three_cost},                                // THREE
+  {0, 0, 4, four_cost},                                 // FOUR
+  {vp9_cat1_prob_high12, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {vp9_cat2_prob_high12, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {vp9_cat3_prob_high12, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {vp9_cat4_prob_high12, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {vp9_cat5_prob_high12, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, zero_cost}                                  // EOB
 };
 #endif
 
@@ -503,7 +476,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   int c;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int eob = p->eobs[block];
-  const PLANE_TYPE type = pd->plane_type;
+  const PLANE_TYPE type = get_plane_type(plane);
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h
index 11b78ba3..c0f09c7b 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/libvpx/vp9/encoder/vp9_tokenize.h
@@ -54,6 +54,20 @@ struct ThreadData;
 void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
 
+typedef struct {
+  const vpx_prob *prob;
+  int len;
+  int base_val;
+  const int16_t *cost;
+} vp9_extra_bit;
+
+// indexed by token value
+extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
+extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 extern const int16_t *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
diff --git a/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm b/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
new file mode 100644
index 00000000..e476323e
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
@@ -0,0 +1,261 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+;
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+;                                     intptr_t block_size, int64_t *ssz)
+;
+
+INIT_XMM avx
+cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
+  vzeroupper
+
+  ; If only one iteration is required, then handle this as a special case.
+  ; It is the most frequent case, so we can have a significant gain here
+  ; by not setting up a loop and accumulators.
+  cmp    sizeq, 16
+  jne   .generic
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Common case of size == 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  ; Load input vectors
+  mova      xm0, [dqcq]
+  packssdw  xm0, [dqcq+16]
+  mova      xm2, [uqcq]
+  packssdw  xm2, [uqcq+16]
+
+  mova      xm1, [dqcq+32]
+  packssdw  xm1, [dqcq+48]
+  mova      xm3, [uqcq+32]
+  packssdw  xm3, [uqcq+48]
+
+  ; Compute the errors.
+  psubw     xm0, xm2
+  psubw     xm1, xm3
+
+  ; Individual errors are max 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+  pmaddwd   xm2, xm2
+  pmaddwd   xm3, xm3
+
+  pmaddwd   xm0, xm0
+  pmaddwd   xm1, xm1
+
+  ; Squares are always positive, so we can use unsigned arithmetic after
+  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+  ; fit in 32bits
+  paddd     xm2, xm3
+  paddd     xm0, xm1
+
+  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
+  pxor      xm5, xm5
+
+  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
+  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits
+
+  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
+  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits
+
+  paddq     xm2, xm3
+  paddq     xm0, xm1
+
+  psrldq    xm3, xm2, 8
+  psrldq    xm1, xm0, 8
+
+  paddq     xm2, xm3
+  paddq     xm0, xm1
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm0
+  movq   [sszq], xm2
+%else
+  movd      eax, xm0
+  pextrd    edx, xm0, 1
+  movq   [sszd], xm2
+%endif
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of size != 16, speculative low precision
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ALIGN 16
+.generic:
+  pxor      xm4, xm4                ; sse accumulator
+  pxor      xm5, xm5                ; overflow detection register for xm4
+  pxor      xm6, xm6                ; ssz accumulator
+  pxor      xm7, xm7                ; overflow detection register for xm6
+  lea      uqcq, [uqcq+sizeq*4]
+  lea      dqcq, [dqcq+sizeq*4]
+  neg     sizeq
+
+  ; Push the negative size as the high precision code might need it
+  push    sizeq
+
+.loop:
+  ; Load input vectors
+  mova      xm0, [dqcq+sizeq*4]
+  packssdw  xm0, [dqcq+sizeq*4+16]
+  mova      xm2, [uqcq+sizeq*4]
+  packssdw  xm2, [uqcq+sizeq*4+16]
+
+  mova      xm1, [dqcq+sizeq*4+32]
+  packssdw  xm1, [dqcq+sizeq*4+48]
+  mova      xm3, [uqcq+sizeq*4+32]
+  packssdw  xm3, [uqcq+sizeq*4+48]
+
+  add     sizeq, 16
+
+  ; Compute the squared errors.
+  ; Individual errors are max 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+  psubw     xm0, xm2
+  pmaddwd   xm2, xm2
+  pmaddwd   xm0, xm0
+
+  psubw     xm1, xm3
+  pmaddwd   xm3, xm3
+  pmaddwd   xm1, xm1
+
+  ; Squares are always positive, so we can use unsigned arithmetic after
+  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+  ; fit in 32bits
+  paddd     xm2, xm3
+  paddd     xm0, xm1
+
+  ; We accumulate using 32 bit arithmetic, but detect potential overflow
+  ; by checking if the MSB of either accumulator has ever been set.
+  ; If yes, we redo the whole compute at the end on higher precision, but
+  ; this happens extremely rarely, so we still achieve a net gain.
+  paddd     xm4, xm0
+  paddd     xm6, xm2
+  por       xm5, xm4  ; OR in the accumulator for overflow detection
+  por       xm7, xm6  ; OR in the accumulator for overflow detection
+
+  jnz .loop
+
+  ; Add pairs horizontally (still only on 32 bits)
+  phaddd    xm4, xm4
+  por       xm5, xm4  ; OR in the accumulator for overflow detection
+  phaddd    xm6, xm6
+  por       xm7, xm6  ; OR in the accumulator for overflow detection
+
+  ; Check for possibility of overflow by testing if the MSB (bit 31) of each
+  ; dword lane has ever been set. If it was not, then there was no overflow
+  ; and the final sum will fit in 32 bits. If overflow happened, then
+  ; we redo the whole computation on higher precision.
+  por       xm7, xm5
+  pmovmskb   r4, xm7
+  test       r4, 0x8888
+  jnz .highprec
+
+  phaddd    xm4, xm4
+  phaddd    xm6, xm6
+  pmovzxdq  xm4, xm4
+  pmovzxdq  xm6, xm6
+
+  ; Restore stack
+  pop     sizeq
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm4
+  movq   [sszq], xm6
+%else
+  movd      eax, xm4
+  pextrd    edx, xm4, 1
+  movq   [sszd], xm6
+%endif
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of size != 16, high precision case
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.highprec:
+  pxor      xm4, xm4                 ; sse accumulator
+  pxor      xm5, xm5                 ; dedicated zero register
+  pxor      xm6, xm6                 ; ssz accumulator
+  pop     sizeq
+
+.loophp:
+  mova      xm0, [dqcq+sizeq*4]
+  packssdw  xm0, [dqcq+sizeq*4+16]
+  mova      xm2, [uqcq+sizeq*4]
+  packssdw  xm2, [uqcq+sizeq*4+16]
+
+  mova      xm1, [dqcq+sizeq*4+32]
+  packssdw  xm1, [dqcq+sizeq*4+48]
+  mova      xm3, [uqcq+sizeq*4+32]
+  packssdw  xm3, [uqcq+sizeq*4+48]
+
+  add     sizeq, 16
+
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+  psubw     xm0, xm2
+  pmaddwd   xm2, xm2
+  pmaddwd   xm0, xm0
+
+  psubw     xm1, xm3
+  pmaddwd   xm3, xm3
+  pmaddwd   xm1, xm1
+
+  ; accumulate in 64bit
+  punpckldq xm7, xm0, xm5
+  punpckhdq xm0, xm5
+  paddq     xm4, xm7
+
+  punpckldq xm7, xm2, xm5
+  punpckhdq xm2, xm5
+  paddq     xm6, xm7
+
+  punpckldq xm7, xm1, xm5
+  punpckhdq xm1, xm5
+  paddq     xm4, xm7
+
+  punpckldq xm7, xm3, xm5
+  punpckhdq xm3, xm5
+  paddq     xm6, xm7
+
+  paddq     xm4, xm0
+  paddq     xm4, xm1
+  paddq     xm6, xm2
+  paddq     xm6, xm3
+
+  jnz .loophp
+
+  ; Accumulate horizontally
+  movhlps   xm5, xm4
+  movhlps   xm7, xm6
+  paddq     xm4, xm5
+  paddq     xm6, xm7
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm4
+  movq   [sszq], xm6
+%else
+  movd      eax, xm4
+  pextrd    edx, xm4, 1
+  movq   [sszd], xm6
+%endif
+  RET
+
+END
diff --git a/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
new file mode 100644
index 00000000..f3b8f019
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
@@ -0,0 +1,98 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+;
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+;                                     intptr_t block_size, int64_t *ssz)
+;
+
+INIT_XMM sse2
+cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*4]
+  lea     dqcq, [dqcq+sizeq*4]
+  neg    sizeq
+
+  ALIGN 16
+
+.loop:
+  mova      m0, [dqcq+sizeq*4]
+  packssdw  m0, [dqcq+sizeq*4+mmsize]
+  mova      m2, [uqcq+sizeq*4]
+  packssdw  m2, [uqcq+sizeq*4+mmsize]
+
+  mova      m1, [dqcq+sizeq*4+mmsize*2]
+  packssdw  m1, [dqcq+sizeq*4+mmsize*3]
+  mova      m3, [uqcq+sizeq*4+mmsize*2]
+  packssdw  m3, [uqcq+sizeq*4+mmsize*3]
+
+  add    sizeq, mmsize
+
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+  psubw     m0, m2
+  pmaddwd   m2, m2
+  pmaddwd   m0, m0
+
+  psubw     m1, m3
+  pmaddwd   m3, m3
+  pmaddwd   m1, m1
+
+  ; accumulate in 64bit
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+
+  punpckldq m7, m2, m5
+  punpckhdq m2, m5
+  paddq     m6, m7
+
+  punpckldq m7, m1, m5
+  punpckhdq m1, m5
+  paddq     m4, m7
+
+  punpckldq m7, m3, m5
+  punpckhdq m3, m5
+  paddq     m6, m7
+
+  paddq     m4, m0
+  paddq     m4, m1
+  paddq     m6, m2
+  paddq     m6, m3
+
+  jnz .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4
+  movhlps   m7, m6
+  paddq     m4, m5
+  paddq     m6, m7
+
+%if ARCH_X86_64
+  movq    rax, m4
+  movq [sszq], m6
+%else
+  mov     eax, sszm
+  pshufd   m5, m4, 0x1
+  movq  [eax], m6
+  movd    eax, m4
+  movd    edx, m5
+%endif
+  RET
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index f155b9ae..6ccba0f8 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -45,6 +45,9 @@ struct vp9_extracfg {
   vpx_bit_depth_t             bit_depth;
   vp9e_tune_content           content;
   vpx_color_space_t           color_space;
+  vpx_color_range_t           color_range;
+  int                         render_width;
+  int                         render_height;
 };
 
 static struct vp9_extracfg default_extra_cfg = {
@@ -71,6 +74,9 @@ static struct vp9_extracfg default_extra_cfg = {
   VPX_BITS_8,                 // Bit depth
   VP9E_CONTENT_DEFAULT,       // content
   VPX_CS_UNKNOWN,             // color space
+  0,                          // color range
+  0,                          // render width
+  0,                          // render height
 };
 
 struct vpx_codec_alg_priv {
@@ -321,6 +327,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
     ERROR("Codec bit-depth 8 not supported in profile > 1");
   }
   RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
+  RANGE_CHECK(extra_cfg, color_range,
+              VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE);
   return VPX_CODEC_OK;
 }
 
@@ -465,6 +473,9 @@ static vpx_codec_err_t set_encoder_config(
 #endif
 
   oxcf->color_space = extra_cfg->color_space;
+  oxcf->color_range = extra_cfg->color_range;
+  oxcf->render_width  = extra_cfg->render_width;
+  oxcf->render_height = extra_cfg->render_height;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
   oxcf->min_gf_interval = extra_cfg->min_gf_interval;
@@ -1256,30 +1267,6 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
   }
 }
 
-static vpx_codec_err_t ctrl_update_entropy(vpx_codec_alg_priv_t *ctx,
-                                           va_list args) {
-  const int update = va_arg(args, int);
-
-  vp9_update_entropy(ctx->cpi, update);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t ctrl_update_reference(vpx_codec_alg_priv_t *ctx,
-                                             va_list args) {
-  const int ref_frame_flags = va_arg(args, int);
-
-  vp9_update_reference(ctx->cpi, ref_frame_flags);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t ctrl_use_reference(vpx_codec_alg_priv_t *ctx,
-                                          va_list args) {
-  const int reference_flag = va_arg(args, int);
-
-  vp9_use_as_reference(ctx->cpi, reference_flag);
-  return VPX_CODEC_OK;
-}
-
 static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
   (void)ctx;
@@ -1362,17 +1349,21 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
   VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
   SVC *const svc = &cpi->svc;
 
-  svc->spatial_layer_id = data->spatial_layer_id;
+  svc->first_spatial_layer_to_encode = data->spatial_layer_id;
+  svc->spatial_layer_to_encode = data->spatial_layer_id;
   svc->temporal_layer_id = data->temporal_layer_id;
   // Checks on valid layer_id input.
   if (svc->temporal_layer_id < 0 ||
       svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  if (svc->spatial_layer_id < 0 ||
-      svc->spatial_layer_id >= (int)ctx->cfg.ss_number_layers) {
+  if (svc->first_spatial_layer_to_encode < 0 ||
+      svc->first_spatial_layer_to_encode >= (int)ctx->cfg.ss_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
+  // First spatial layer to encode not implemented for two-pass.
+  if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0)
+    return VPX_CODEC_INVALID_PARAM;
   return VPX_CODEC_OK;
 }
 
@@ -1412,6 +1403,20 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
+  int sl;
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl];
+    cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl];
+    cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl];
+    cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl];
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
                                                  va_list args) {
   vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
@@ -1436,11 +1441,24 @@ static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_color_range(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_range = CAST(VP9E_SET_COLOR_RANGE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  int *const render_size = va_arg(args, int *);
+  extra_cfg.render_width  = render_size[0];
+  extra_cfg.render_height = render_size[1];
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,                ctrl_copy_reference},
-  {VP8E_UPD_ENTROPY,                  ctrl_update_entropy},
-  {VP8E_UPD_REFERENCE,                ctrl_update_reference},
-  {VP8E_USE_REFERENCE,                ctrl_use_reference},
 
   // Setters
   {VP8_SET_REFERENCE,                 ctrl_set_reference},
@@ -1472,9 +1490,12 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP9E_SET_SVC_LAYER_ID,             ctrl_set_svc_layer_id},
   {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
   {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
+  {VP9E_SET_COLOR_RANGE,              ctrl_set_color_range},
   {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
   {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
   {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
+  {VP9E_SET_SVC_REF_FRAME_CONFIG,     ctrl_set_svc_ref_frame_config},
+  {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 96ede3c4..be5d1600 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -18,67 +18,19 @@
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_frame_buffers.h"
 
-#include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_decodeframe.h"
 
+#include "vp9/vp9_dx_iface.h"
 #include "vp9/vp9_iface_common.h"
 
 #define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
 
-typedef vpx_codec_stream_info_t vp9_stream_info_t;
-
-// This limit is due to framebuffer numbers.
-// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
-#define FRAME_CACHE_SIZE 6   // Cache maximum 6 decoded frames.
-
-typedef struct cache_frame {
-  int fb_idx;
-  vpx_image_t img;
-} cache_frame;
-
-struct vpx_codec_alg_priv {
-  vpx_codec_priv_t        base;
-  vpx_codec_dec_cfg_t     cfg;
-  vp9_stream_info_t       si;
-  int                     postproc_cfg_set;
-  vp8_postproc_cfg_t      postproc_cfg;
-  vpx_decrypt_cb          decrypt_cb;
-  void                    *decrypt_state;
-  vpx_image_t             img;
-  int                     img_avail;
-  int                     flushed;
-  int                     invert_tile_order;
-  int                     last_show_frame;  // Index of last output frame.
-  int                     byte_alignment;
-  int                     skip_loop_filter;
-
-  // Frame parallel related.
-  int                     frame_parallel_decode;  // frame-based threading.
-  VPxWorker               *frame_workers;
-  int                     num_frame_workers;
-  int                     next_submit_worker_id;
-  int                     last_submit_worker_id;
-  int                     next_output_worker_id;
-  int                     available_threads;
-  cache_frame             frame_cache[FRAME_CACHE_SIZE];
-  int                     frame_cache_write;
-  int                     frame_cache_read;
-  int                     num_cache_frames;
-  int                     need_resync;      // wait for key/intra-only frame
-  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
-  BufferPool              *buffer_pool;
-
-  // External frame buffer info to save for VP9 common.
-  void *ext_priv;  // Private data associated with the external frame buffers.
-  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
-  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
-};
-
 static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
                                     vpx_codec_priv_enc_mr_cfg_t *data) {
   // This function only allocates space for the vpx_codec_alg_priv_t
@@ -87,7 +39,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
   (void)data;
 
   if (!ctx->priv) {
-    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+    vpx_codec_alg_priv_t *const priv =
+        (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
     if (priv == NULL)
       return VPX_CODEC_MEM_ERROR;
 
@@ -183,7 +136,7 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
   si->w = si->h = 0;
 
   if (decrypt_cb) {
-    data_sz = MIN(sizeof(clear_buffer), data_sz);
+    data_sz = VPXMIN(sizeof(clear_buffer), data_sz);
     decrypt_cb(decrypt_state, data, clear_buffer, data_sz);
     data = clear_buffer;
   }
@@ -977,9 +930,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_INVALID_PARAM;
 }
 
-static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
-                                             va_list args) {
-  int *const display_size = va_arg(args, int *);
+static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const render_size = va_arg(args, int *);
 
   // Only support this function in serial decode.
   if (ctx->frame_parallel_decode) {
@@ -987,14 +940,14 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
     return VPX_CODEC_INCAPABLE;
   }
 
-  if (display_size) {
+  if (render_size) {
     if (ctx->frame_workers) {
       VPxWorker *const worker = ctx->frame_workers;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
-      display_size[0] = cm->display_width;
-      display_size[1] = cm->display_height;
+      render_size[0] = cm->render_width;
+      render_size[1] = cm->render_height;
       return VPX_CODEC_OK;
     } else {
       return VPX_CODEC_ERROR;
@@ -1093,7 +1046,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},
   {VP8D_GET_FRAME_CORRUPTED,      ctrl_get_frame_corrupted},
   {VP9_GET_REFERENCE,             ctrl_get_reference},
-  {VP9D_GET_DISPLAY_SIZE,         ctrl_get_display_size},
+  {VP9D_GET_DISPLAY_SIZE,         ctrl_get_render_size},
   {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
   {VP9D_GET_FRAME_SIZE,           ctrl_get_frame_size},
 
diff --git a/libvpx/vp9/vp9_dx_iface.h b/libvpx/vp9/vp9_dx_iface.h
new file mode 100644
index 00000000..e0e948e1
--- /dev/null
+++ b/libvpx/vp9/vp9_dx_iface.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_VP9_DX_IFACE_H_
+#define VP9_VP9_DX_IFACE_H_
+
+#include "vp9/decoder/vp9_decoder.h"
+
+typedef vpx_codec_stream_info_t vp9_stream_info_t;
+
+// This limit is due to the number of available frame buffers.
+// TODO(hkuang): Remove this limit after implementing on-demand frame buffers.
+#define FRAME_CACHE_SIZE 6   // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+  int fb_idx;
+  vpx_image_t img;
+} cache_frame;
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_dec_cfg_t     cfg;
+  vp9_stream_info_t       si;
+  int                     postproc_cfg_set;
+  vp8_postproc_cfg_t      postproc_cfg;
+  vpx_decrypt_cb          decrypt_cb;
+  void                    *decrypt_state;
+  vpx_image_t             img;
+  int                     img_avail;
+  int                     flushed;
+  int                     invert_tile_order;
+  int                     last_show_frame;  // Index of last output frame.
+  int                     byte_alignment;
+  int                     skip_loop_filter;
+
+  // Frame parallel related.
+  int                     frame_parallel_decode;  // frame-based threading.
+  VPxWorker               *frame_workers;
+  int                     num_frame_workers;
+  int                     next_submit_worker_id;
+  int                     last_submit_worker_id;
+  int                     next_output_worker_id;
+  int                     available_threads;
+  cache_frame             frame_cache[FRAME_CACHE_SIZE];
+  int                     frame_cache_write;
+  int                     frame_cache_read;
+  int                     num_cache_frames;
+  int                     need_resync;      // wait for key/intra-only frame
+  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+  BufferPool              *buffer_pool;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+};
+
+#endif  // VP9_VP9_DX_IFACE_H_
diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h
index 58bb7d5d..938d4224 100644
--- a/libvpx/vp9/vp9_iface_common.h
+++ b/libvpx/vp9/vp9_iface_common.h
@@ -37,11 +37,14 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
     }
   }
   img->cs = yv12->color_space;
+  img->range = yv12->color_range;
   img->bit_depth = 8;
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
   img->d_w = yv12->y_crop_width;
   img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
   img->x_chroma_shift = yv12->subsampling_x;
   img->y_chroma_shift = yv12->subsampling_y;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
@@ -56,7 +59,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
   if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
     // vpx_image_t uses byte strides and a pointer to the first byte
     // of the image.
-    img->fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH);
     img->bit_depth = yv12->bit_depth;
     img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
     img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
@@ -83,6 +86,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
 
   yv12->y_crop_width  = img->d_w;
   yv12->y_crop_height = img->d_h;
+  yv12->render_width  = img->r_w;
+  yv12->render_height = img->r_h;
   yv12->y_width  = img->d_w;
   yv12->y_height = img->d_h;
 
@@ -96,6 +101,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
   yv12->color_space = img->cs;
+  yv12->color_range = img->range;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 84b12d78..25a176f8 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -100,8 +100,13 @@ endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
+else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 endif
+endif
 
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index 0e9cf161..4c6fd007 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -16,6 +16,7 @@ VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
 VP9_DX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
 
 VP9_DX_SRCS-yes += vp9_dx_iface.c
+VP9_DX_SRCS-yes += vp9_dx_iface.h
 
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
diff --git a/libvpx/vpx/src/svc_encodeframe.c b/libvpx/vpx/src/svc_encodeframe.c
index 9844ace5..ff600830 100644
--- a/libvpx/vpx/src/svc_encodeframe.c
+++ b/libvpx/vpx/src/svc_encodeframe.c
@@ -339,7 +339,8 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
               (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
           enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 2] =
               spatial_layer_target;
-        } else if (svc_ctx->temporal_layering_mode == 2) {
+        } else if (svc_ctx->temporal_layering_mode == 2 ||
+                   svc_ctx->temporal_layering_mode == 1) {
           enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] =
               spatial_layer_target * 2 / 3;
           enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] =
@@ -417,7 +418,8 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   // si->svc_params.temporal_layering_mode = svc_ctx->temporal_layering_mode;
   if (svc_ctx->temporal_layering_mode == 3) {
     svc_ctx->temporal_layers = 3;
-  } else if (svc_ctx->temporal_layering_mode == 2) {
+  } else if (svc_ctx->temporal_layering_mode == 2 ||
+             svc_ctx->temporal_layering_mode == 1) {
     svc_ctx->temporal_layers = 2;
   }
 
@@ -477,10 +479,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   if (enc_cfg->rc_end_usage == VPX_CBR) {
     enc_cfg->rc_resize_allowed = 0;
     enc_cfg->rc_min_quantizer = 2;
-    enc_cfg->rc_max_quantizer = 63;
+    enc_cfg->rc_max_quantizer = 56;
     enc_cfg->rc_undershoot_pct = 50;
     enc_cfg->rc_overshoot_pct = 50;
-    enc_cfg->rc_buf_initial_sz = 20;
+    enc_cfg->rc_buf_initial_sz = 500;
     enc_cfg->rc_buf_optimal_sz = 600;
     enc_cfg->rc_buf_sz = 1000;
   }
@@ -494,10 +496,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     svc_log(svc_ctx, SVC_LOG_ERROR, "svc_enc_init error\n");
     return res;
   }
-
-  vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
-  vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &si->svc_params);
-
+  if (svc_ctx->spatial_layers > 1 || svc_ctx->temporal_layers > 1) {
+    vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
+    vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &si->svc_params);
+  }
   return VPX_CODEC_OK;
 }
 
diff --git a/libvpx/vpx/svc_context.h b/libvpx/vpx/svc_context.h
index a09651cc..5bc25189 100644
--- a/libvpx/vpx/svc_context.h
+++ b/libvpx/vpx/svc_context.h
@@ -40,6 +40,7 @@ typedef struct {
   int output_rc_stat;  // for outputting rc stats
   int speed;  // speed setting for codec
   int threads;
+  int aqmode;  // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on.
   // private storage for vpx_svc_encode
   void *internal;
 } SvcContext;
diff --git a/libvpx/vpx/vp8.h b/libvpx/vpx/vp8.h
index 2a31af6d..8a035f97 100644
--- a/libvpx/vpx/vp8.h
+++ b/libvpx/vpx/vp8.h
@@ -116,19 +116,29 @@ typedef struct vp9_ref_frame {
   vpx_image_t  img; /**< img structure to populate (output) */
 } vp9_ref_frame_t;
 
+/*!\cond */
 /*!\brief vp8 decoder control function parameter type
  *
  * defines the data type for each of VP8 decoder control function requires
  */
 VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
+#define VPX_CTRL_VP8_SET_REFERENCE
 VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
+#define VPX_CTRL_VP8_COPY_REFERENCE
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
+#define VPX_CTRL_VP8_SET_POSTPROC
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)
+#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)
+#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)
+#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
+#define VPX_CTRL_VP9_GET_REFERENCE
 
+/*!\endcond */
 /*! @} - end defgroup vp8 */
 
 #ifdef __cplusplus
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index 31120df2..bd99c6dc 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -141,29 +141,11 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void);
  * \sa #vpx_codec_control
  */
 enum vp8e_enc_control_id {
-  /*!\brief Codec control function to set mode of entropy update in encoder.
-   *
-   * Supported in codecs: VP8, VP9
-   */
-  VP8E_UPD_ENTROPY           = 5,
-
-  /*!\brief Codec control function to set reference update mode in encoder.
-   *
-   * Supported in codecs: VP8, VP9
-   */
-  VP8E_UPD_REFERENCE,
-
-  /*!\brief Codec control function to set which reference frame encoder can use.
-   *
-   * Supported in codecs: VP8, VP9
-   */
-  VP8E_USE_REFERENCE,
-
   /*!\brief Codec control function to pass an ROI map to encoder.
    *
    * Supported in codecs: VP8, VP9
    */
-  VP8E_SET_ROI_MAP,
+  VP8E_SET_ROI_MAP           = 8,
 
   /*!\brief Codec control function to pass an Active map to encoder.
    *
@@ -547,6 +529,31 @@ enum vp8e_enc_control_id {
    * Supported in codecs: VP9
    */
   VP9E_GET_ACTIVEMAP,
+
+  /*!\brief Codec control function to set color range bit.
+   * \note Valid ranges: 0..1, default is 0
+   *                     0 = Limited range (16..235 or HBD equivalent)
+   *                     1 = Full range (0..255 or HBD equivalent)
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_COLOR_RANGE,
+
+  /*!\brief Codec control function to set the frame flags and buffer indices
+   * for spatial layers. The frame flags and buffer indices are set using the
+   * struct #vpx_svc_ref_frame_config defined below.
+   *
+   * Supported in codecs: VP9
+  */
+  VP9E_SET_SVC_REF_FRAME_CONFIG,
+
+  /*!\brief Codec control function to set intended rendering image size.
+   *
+   * By default, this is identical to the image size in pixels.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_RENDER_SIZE,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -673,6 +680,22 @@ typedef struct vpx_svc_layer_id {
   int temporal_layer_id;      /**< Temporal layer id number. */
 } vpx_svc_layer_id_t;
 
+/*!\brief  vp9 svc frame flag parameters.
+ *
+ * This defines the frame flags and buffer indices for each spatial layer for
+ * svc encoding.
+ * This is used with the #VP9E_SET_SVC_REF_FRAME_CONFIG control to set frame
+ * flags and buffer indices for each spatial layer for the current (super)frame.
+ *
+ */
+typedef struct vpx_svc_ref_frame_config {
+  int frame_flags[VPX_TS_MAX_LAYERS];  /**< Frame flags. */
+  int lst_fb_idx[VPX_TS_MAX_LAYERS];  /**< Last buffer index. */
+  int gld_fb_idx[VPX_TS_MAX_LAYERS];  /**< Golden buffer index. */
+  int alt_fb_idx[VPX_TS_MAX_LAYERS];  /**< Altref buffer index. */
+} vpx_svc_ref_frame_config_t;
+
+/*!\cond */
 /*!\brief VP8 encoder control function parameter type
  *
  * Defines the data types that VP8E control functions take. Note that
@@ -680,83 +703,113 @@ typedef struct vpx_svc_layer_id {
  *
  */
 
-
-/* These controls have been deprecated in favor of the flags parameter to
- * vpx_codec_encode(). See the definition of VP8_EFLAG_* above.
- */
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_ENTROPY,            int)
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_REFERENCE,          int)
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE,          int)
-
 VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS,        int)
+#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
 VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID,  int)
+#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
 VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP,            vpx_roi_map_t *)
+#define VPX_CTRL_VP8E_SET_ROI_MAP
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP,          vpx_active_map_t *)
+#define VPX_CTRL_VP8E_SET_ACTIVEMAP
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE,          vpx_scaling_mode_t *)
+#define VPX_CTRL_VP8E_SET_SCALEMODE
 
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC,                int)
+#define VPX_CTRL_VP9E_SET_SVC
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS,     void *)
+#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
 VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK,   void *)
+#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID,       vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
 
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED,            int)
+#define VPX_CTRL_VP8E_SET_CPUUSED
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF,   unsigned int)
+#define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF
 VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY,  unsigned int)
+#define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY
 VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS,          unsigned int)
+#define VPX_CTRL_VP8E_SET_SHARPNESS
 VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD,   unsigned int)
+#define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD
 VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS,   int) /* vp8e_token_partitions */
+#define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS
 
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES,     unsigned int)
+#define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH,     unsigned int)
+#define VPX_CTRL_VP8E_SET_ARNR_STRENGTH
 VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE,     unsigned int)
+#define VPX_CTRL_VP8E_SET_ARNR_TYPE
 VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             int) /* vp8e_tuning */
+#define VPX_CTRL_VP8E_SET_TUNING
 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL,      unsigned int)
+#define VPX_CTRL_VP8E_SET_CQ_LEVEL
 
 VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS,  int)
+#define VPX_CTRL_VP9E_SET_TILE_COLUMNS
 VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS,  int)
+#define VPX_CTRL_VP9E_SET_TILE_ROWS
 
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64,  int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
 VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID,  vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
 
 VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
 VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT
 
 VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
 
 VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
 
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
+#define VPX_CTRL_VP9E_SET_LOSSLESS
 
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+#define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING
 
 VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int)
+#define VPX_CTRL_VP9E_SET_AQ_MODE
 
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
+#define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST
 
 VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY,  unsigned int)
+#define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY
 
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+#define VPX_CTRL_VP9E_SET_TUNE_CONTENT
 
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+#define VPX_CTRL_VP9E_SET_COLOR_SPACE
 
 VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL,  unsigned int)
-
-/*!\brief
- *
- * TODO(debargha) : add support of the control in ffmpeg
- */
 #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
 
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL,  unsigned int)
-/*!\brief
- *
- * TODO(debargha) : add support of the control in ffmpeg
- */
 #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
 
 VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
+#define VPX_CTRL_VP9E_GET_ACTIVEMAP
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int)
+#define VPX_CTRL_VP9E_SET_COLOR_RANGE
+
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
+#define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
+
+VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
+#define VPX_CTRL_VP9E_SET_RENDER_SIZE
+
+/*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vpx/vp8dx.h b/libvpx/vpx/vp8dx.h
index 27b9f780..1f02fd59 100644
--- a/libvpx/vpx/vp8dx.h
+++ b/libvpx/vpx/vp8dx.h
@@ -147,6 +147,7 @@ typedef struct vpx_decrypt_init {
 typedef vpx_decrypt_init vp8_decrypt_init;
 
 
+/*!\cond */
 /*!\brief VP8 decoder control function parameter type
  *
  * Defines the data types that VP8D control functions take. Note that
@@ -156,15 +157,25 @@ typedef vpx_decrypt_init vp8_decrypt_init;
 
 
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,    int *)
+#define VPX_CTRL_VP8D_GET_LAST_REF_UPDATES
 VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,     int *)
+#define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED,       int *)
+#define VPX_CTRL_VP8D_GET_LAST_REF_USED
 VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR,           vpx_decrypt_init *)
+#define VPX_CTRL_VPXD_SET_DECRYPTOR
 VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR,           vpx_decrypt_init *)
+#define VPX_CTRL_VP8D_SET_DECRYPTOR
 VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE,        int *)
+#define VPX_CTRL_VP9D_GET_DISPLAY_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH,           unsigned int *)
+#define VPX_CTRL_VP9D_GET_BIT_DEPTH
 VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE,          int *)
+#define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
+#define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
 
+/*!\endcond */
 /*! @} - end defgroup vp8_decoder */
 
 #ifdef __cplusplus
diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h
index 2b17f98a..955e8735 100644
--- a/libvpx/vpx/vpx_encoder.h
+++ b/libvpx/vpx/vpx_encoder.h
@@ -150,7 +150,7 @@ extern "C" {
   partitions can be decoded even
   though earlier partitions have
   been lost. Note that intra
-  predicition is still done over
+  prediction is still done over
   the partition boundary. */
 
   /*!\brief Encoder output packet variants
diff --git a/libvpx/vpx/vpx_image.h b/libvpx/vpx/vpx_image.h
index c06d3510..e9e952c4 100644
--- a/libvpx/vpx/vpx_image.h
+++ b/libvpx/vpx/vpx_image.h
@@ -78,10 +78,17 @@ extern "C" {
     VPX_CS_SRGB       = 7   /**< sRGB */
   } vpx_color_space_t; /**< alias for enum vpx_color_space */
 
+  /*!\brief List of supported color ranges */
+  typedef enum vpx_color_range {
+    VPX_CR_STUDIO_RANGE = 0,    /**< Y [16..235], UV [16..240] */
+    VPX_CR_FULL_RANGE   = 1     /**< YUV/RGB [0..255] */
+  } vpx_color_range_t; /**< alias for enum vpx_color_range */
+
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
     vpx_color_space_t cs; /**< Color Space */
+    vpx_color_range_t range; /**< Color Range */
 
     /* Image storage dimensions */
     unsigned int  w;           /**< Stored image width */
@@ -92,6 +99,10 @@ extern "C" {
     unsigned int  d_w;   /**< Displayed image width */
     unsigned int  d_h;   /**< Displayed image height */
 
+    /* Image intended rendering dimensions */
+    unsigned int  r_w;   /**< Intended rendering image width */
+    unsigned int  r_h;   /**< Intended rendering image height */
+
     /* Chroma subsampling info */
     unsigned int  x_chroma_shift;   /**< subsampling order, X */
     unsigned int  y_chroma_shift;   /**< subsampling order, Y */
diff --git a/libvpx/vpx_dsp/bitreader.c b/libvpx/vpx_dsp/bitreader.c
index 4420fade..6ad806ac 100644
--- a/libvpx/vpx_dsp/bitreader.c
+++ b/libvpx/vpx_dsp/bitreader.c
@@ -13,6 +13,7 @@
 
 #include "vpx_dsp/bitreader.h"
 #include "vpx_dsp/prob.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_util/endian_inl.h"
@@ -48,7 +49,7 @@ void vpx_reader_fill(vpx_reader *r) {
   int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
 
   if (r->decrypt_cb) {
-    size_t n = MIN(sizeof(r->clear_buffer), bytes_left);
+    size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left);
     r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
     buffer = r->clear_buffer;
     buffer_start = r->clear_buffer;
diff --git a/libvpx/vpx_dsp/bitreader_buffer.c b/libvpx/vpx_dsp/bitreader_buffer.c
index fb04ee63..bb917263 100644
--- a/libvpx/vpx_dsp/bitreader_buffer.c
+++ b/libvpx/vpx_dsp/bitreader_buffer.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./vpx_config.h"
 #include "./bitreader_buffer.h"
 
 size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
@@ -39,3 +40,14 @@ int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
   const int value = vpx_rb_read_literal(rb, bits);
   return vpx_rb_read_bit(rb) ? -value : value;
 }
+
+// Reads a signed literal. The unsigned cast keeps the left shift well-defined.
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
+#if CONFIG_MISC_FIXES  // (bits + 1)-bit field, sign-extended by shifting back.
+  const int nbits = sizeof(unsigned) * 8 - bits - 1;
+  const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
+  return ((int) value) >> nbits;
+#else  // Magnitude in 'bits' bits followed by a sign bit.
+  return vpx_rb_read_signed_literal(rb, bits);
+#endif
+}
diff --git a/libvpx/vpx_dsp/bitreader_buffer.h b/libvpx/vpx_dsp/bitreader_buffer.h
index 03b156ba..8a48a95e 100644
--- a/libvpx/vpx_dsp/bitreader_buffer.h
+++ b/libvpx/vpx_dsp/bitreader_buffer.h
@@ -38,6 +38,8 @@ int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
 
 int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
 
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vpx_dsp/bitwriter_buffer.c b/libvpx/vpx_dsp/bitwriter_buffer.c
index 0dfb859d..6182a722 100644
--- a/libvpx/vpx_dsp/bitwriter_buffer.c
+++ b/libvpx/vpx_dsp/bitwriter_buffer.c
@@ -9,7 +9,9 @@
  */
 
 #include <limits.h>
+#include <stdlib.h>
 
+#include "./vpx_config.h"
 #include "./bitwriter_buffer.h"
 
 size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
@@ -34,3 +36,13 @@ void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
   for (bit = bits - 1; bit >= 0; bit--)
     vpx_wb_write_bit(wb, (data >> bit) & 1);
 }
+
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb,
+                                     int data, int bits) {
+#if CONFIG_MISC_FIXES  // Emit the low (bits + 1) bits of data unchanged.
+  vpx_wb_write_literal(wb, data, bits + 1);
+#else  // Emit |data| in 'bits' bits, then a sign bit (1 if negative).
+  vpx_wb_write_literal(wb, abs(data), bits);
+  vpx_wb_write_bit(wb, data < 0);
+#endif
+}
diff --git a/libvpx/vpx_dsp/bitwriter_buffer.h b/libvpx/vpx_dsp/bitwriter_buffer.h
index 9397668e..a123a2fe 100644
--- a/libvpx/vpx_dsp/bitwriter_buffer.h
+++ b/libvpx/vpx_dsp/bitwriter_buffer.h
@@ -28,6 +28,8 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
 
 void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits);
 
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
+                                     int bits);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vpx_dsp/intrapred.c b/libvpx/vpx_dsp/intrapred.c
index 9ba0f644..a9669e51 100644
--- a/libvpx/vpx_dsp/intrapred.c
+++ b/libvpx/vpx_dsp/intrapred.c
@@ -44,6 +44,21 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
       dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
 }
 
+static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                   const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  (void) above;  // Only the left column is read.
+
+  for (r = 0; r < bs; ++r) {  // Note: left[] is read past index bs - 1.
+    for (c = 0; c < bs; ++c) {  // Odd columns use AVG3, even columns AVG2.
+      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
+                            left[(c >> 1) + r + 2])
+          : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
+    }
+    dst += stride;  // Advance to the next output row.
+  }
+}
+
 static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -61,6 +76,20 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  (void) left;  // Only the above row is read.
+  for (r = 0; r < bs; ++r) {  // Odd rows use AVG3, even rows AVG2.
+    for (c = 0; c < bs; ++c) {  // Note: above[] is read past index bs - 1.
+      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
+                            above[(r >> 1) + c + 2])
+          : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
+    }
+    dst += stride;  // Advance to the next output row.
+  }
+}
+
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8_t above_right = above[bs - 1];
@@ -80,6 +109,19 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  (void) left;  // Only the above row is read.
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {  // Last tap index never exceeds 2 * bs - 1.
+      dst[c] = AVG3(above[r + c], above[r + c + 1],
+                    above[r + c + 1 + (r + c + 2 < bs * 2)]);
+    }
+    dst += stride;  // Advance to the next output row.
+  }
+}
+
 static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -247,6 +289,38 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  const int H = above[-1];  // Element just before above[0].
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+
+  memset(dst + stride * 0, AVG3(H, I, J), 4);  // Each row is one value.
+  memset(dst + stride * 1, AVG3(I, J, K), 4);
+  memset(dst + stride * 2, AVG3(J, K, L), 4);
+  memset(dst + stride * 3, AVG3(K, L, L), 4);  // L fills the missing tap.
+}
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  const int H = above[-1];  // Element just before above[0].
+  const int I = above[0];
+  const int J = above[1];
+  const int K = above[2];
+  const int L = above[3];
+  const int M = above[4];  // One element past the 4-wide row.
+
+  dst[0] = AVG3(H, I, J);  // Row 0 is the 3-tap filtered above row.
+  dst[1] = AVG3(I, J, K);
+  dst[2] = AVG3(J, K, L);
+  dst[3] = AVG3(K, L, M);
+  memcpy(dst + stride * 1, dst, 4);  // Rows 1-3 replicate row 0.
+  memcpy(dst + stride * 2, dst, 4);
+  memcpy(dst + stride * 3, dst, 4);
+}
+
 void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   const int I = left[0];
@@ -287,6 +361,30 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
               DST(3, 3) = AVG3(E, F, G);  // differs from vp8
 }
 
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  const int E = above[4];
+  const int F = above[5];
+  const int G = above[6];
+  const int H = above[7];
+  (void)left;  // Only above[0..7] is read.
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+              DST(3, 2) = AVG3(E, F, G);  // AVG3 here, unlike the rest.
+
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 3) = AVG3(F, G, H);  // Skips AVG3(E, F, G).
+}
+
 void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
   const int A = above[0];
@@ -308,6 +406,27 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                                       DST(3, 3) = H;  // differs from vp8
 }
 
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  const int E = above[4];
+  const int F = above[5];
+  const int G = above[6];
+  const int H = above[7];
+  (void)stride;  // NOTE(review): likely redundant if DST() uses stride.
+  (void)left;  // Only above[0..7] is read.
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+              DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+                                      DST(3, 3) = AVG3(G, H, H);  // H reused.
+}
+
 void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   const int I = left[0];
@@ -409,6 +528,23 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
   }
 }
 
+static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
+                                          int bs, const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  int r, c;
+  (void) above;  // Only the left column is read.
+  (void) bd;  // Bit depth is not used by this predictor.
+
+  for (r = 0; r < bs; ++r) {  // Note: left[] is read past index bs - 1.
+    for (c = 0; c < bs; ++c) {  // Odd columns use AVG3, even columns AVG2.
+      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
+                            left[(c >> 1) + r + 2])
+          : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
+    }
+    dst += stride;  // Advance to the next output row.
+  }
+}
+
 static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
                                         const uint16_t *left, int bd) {
@@ -425,6 +561,8 @@ static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
   }
 }
 
+#define highbd_d63e_predictor highbd_d63_predictor
+
 static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
@@ -441,6 +579,21 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int r, c;  // r: output row, c: output column
+  (void) left;  // unused: every sample is taken from above[]
+  (void) bd;  // unused in this function
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {  // third AVG3 index never exceeds 2 * bs - 1
+      dst[c] = AVG3(above[r + c], above[r + c + 1],
+                    above[r + c + 1 + (r + c + 2 < bs * 2)]);
+    }
+    dst += stride;  // step to the next output row
+  }
+}
+
 static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
@@ -679,6 +832,11 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
 intra_pred_no_4x4(d207)
 intra_pred_no_4x4(d63)
 intra_pred_no_4x4(d45)
+#if CONFIG_MISC_FIXES
+intra_pred_allsizes(d207e)
+intra_pred_allsizes(d63e)
+intra_pred_no_4x4(d45e)
+#endif  // CONFIG_MISC_FIXES
 intra_pred_no_4x4(d117)
 intra_pred_no_4x4(d135)
 intra_pred_no_4x4(d153)
diff --git a/libvpx/vpx_dsp/inv_txfm.c b/libvpx/vpx_dsp/inv_txfm.c
index 3afa8cdc..5f3cfddb 100644
--- a/libvpx/vpx_dsp/inv_txfm.c
+++ b/libvpx/vpx_dsp/inv_txfm.c
@@ -170,16 +170,25 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) {
   step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
   step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
-  // stage 2 & stage 3 - even half
-  idct4_c(step1, step1);
-
-  // stage 2 - odd half
+  // stage 2
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step2[4] = WRAPLOW(step1[4] + step1[5], 8);
   step2[5] = WRAPLOW(step1[4] - step1[5], 8);
   step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
   step2[7] = WRAPLOW(step1[6] + step1[7], 8);
 
-  // stage 3 -odd half
+  // stage 3
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
diff --git a/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
index e82dfb7e..2c964afa 100644
--- a/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
+++ b/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -355,7 +355,7 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3,
       /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
       "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
 
-      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), 
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
         [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
       : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
         [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
diff --git a/libvpx/vpx_dsp/prob.h b/libvpx/vpx_dsp/prob.h
index 729f90a5..c3cb103f 100644
--- a/libvpx/vpx_dsp/prob.h
+++ b/libvpx/vpx_dsp/prob.h
@@ -65,7 +65,7 @@ static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
                                    unsigned int count_sat,
                                    unsigned int max_update_factor) {
   const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
-  const unsigned int count = MIN(ct[0] + ct[1], count_sat);
+  const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat);
   const unsigned int factor = max_update_factor * count / count_sat;
   return weighted_prob(pre_prob, prob, factor);
 }
@@ -82,7 +82,7 @@ static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
   if (den == 0) {
     return pre_prob;
   } else {
-    const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
+    const unsigned int count = VPXMIN(den, MODE_MV_COUNT_SAT);
     const unsigned int factor = count_to_update_factor[count];
     const vpx_prob prob =
         clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
diff --git a/libvpx/vpx_dsp/psnrhvs.c b/libvpx/vpx_dsp/psnrhvs.c
index 2de77c05..30017057 100644
--- a/libvpx/vpx_dsp/psnrhvs.c
+++ b/libvpx/vpx_dsp/psnrhvs.c
@@ -191,7 +191,7 @@ static double calc_psnrhvs(const unsigned char *_src, int _systride,
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
           float err;
-          err = fabs(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]);
+          err = fabs((float)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
           if (i != 0 || j != 0)
             err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
           ret += (err * _csf[i][j]) * (err * _csf[i][j]);
diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
index 1959c4d8..9620eaa0 100644
--- a/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/libvpx/vpx_dsp/vpx_dsp.mk
@@ -36,13 +36,13 @@ DSP_SRCS-yes += bitreader_buffer.h
 endif
 
 # intra predictions
-ifneq ($(filter yes,$(CONFIG_VP9) $(CONFIG_VP10)),)
 DSP_SRCS-yes += intrapred.c
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
 endif  # CONFIG_USE_X86INC
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -58,7 +58,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
-endif  # CONFIG_VP9 || CONFIG_VP10
 
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
@@ -249,7 +248,8 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
 endif
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
 endif
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
@@ -308,6 +308,8 @@ DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
 DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
+DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
 
diff --git a/libvpx/vpx_dsp/vpx_dsp_common.h b/libvpx/vpx_dsp/vpx_dsp_common.h
index ccb81895..a9e180e7 100644
--- a/libvpx/vpx_dsp/vpx_dsp_common.h
+++ b/libvpx/vpx_dsp/vpx_dsp_common.h
@@ -13,14 +13,15 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
 
 #if CONFIG_VP9_HIGHBITDEPTH
 // Note:
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 1e56d534..b369b054 100644
--- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -54,322 +54,401 @@ if ($opts{arch} eq "x86_64") {
 # Intra prediction
 #
 
-if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
-  add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_4x4/;
+
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_4x4/;
+
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_4x4/;
+
+add_proto qw/void vpx_d63f_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63f_predictor_4x4/;
+
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_he_predictor_4x4/;
+
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_4x4/;
+
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_4x4 neon/;
+
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
+
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_ve_predictor_4x4/;
 
-  add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
 
-  add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d63_predictor_4x4/, "$ssse3_x86inc";
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
 
-  add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
 
-  add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d117_predictor_4x4/;
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
 
-  add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d135_predictor_4x4 neon/;
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
 
-  add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
+add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_8x8/;
 
-  add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
+add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_8x8/;
 
-  add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_8x8/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
+add_proto qw/void vpx_d63e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_8x8/;
 
-  add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_8x8/;
 
-  add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_8x8/;
 
-  add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d63_predictor_8x8/, "$ssse3_x86inc";
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse_x86inc";
 
-  add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d117_predictor_8x8/;
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d135_predictor_8x8/;
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
 
-  add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
 
-  add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse_x86inc";
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
 
-  add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
 
-  add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
+add_proto qw/void vpx_d207e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_16x16/;
 
-  add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_16x16 neon/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
+add_proto qw/void vpx_d45e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_16x16/;
 
-  add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_16x16/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d45_predictor_16x16 neon/, "$ssse3_x86inc";
+add_proto qw/void vpx_d63e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_16x16/;
 
-  add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d63_predictor_16x16/, "$ssse3_x86inc";
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_16x16/;
 
-  add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d117_predictor_16x16/;
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_16x16/;
 
-  add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d135_predictor_16x16/;
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_16x16/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d153_predictor_16x16/, "$ssse3_x86inc";
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_16x16 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_v_predictor_16x16 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_32x32/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d207_predictor_32x32/, "$ssse3_x86inc";
+add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_32x32/;
 
-  add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d45_predictor_32x32/, "$ssse3_x86inc";
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_32x32/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d63_predictor_32x32/, "$ssse3_x86inc";
+add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_32x32/;
 
-  add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_32x32/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d117_predictor_32x32/;
+add_proto qw/void vpx_d63e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_32x32/;
 
-  add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d135_predictor_32x32/;
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_d153_predictor_32x32/, "$ssse3_x86inc";
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_32x32/;
 
-  add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_32x32/;
 
-  add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_32x32/, "$ssse3_x86inc";
 
-  add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
 
-  add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
 
-  add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-  specialize qw/vpx_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 # High bitdepth functions
-  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d207_predictor_4x4/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d45_predictor_4x4/;
+  add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d63_predictor_4x4/;
+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_h_predictor_4x4/;
+  add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d117_predictor_4x4/;
+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d135_predictor_4x4/;
+  add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d153_predictor_4x4/;
+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_top_predictor_4x4/;
+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
 
-    add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_left_predictor_4x4/;
+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
 
-    add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_128_predictor_4x4/;
+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
 
-    add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d207_predictor_8x8/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d45_predictor_8x8/;
+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d63_predictor_8x8/;
+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_4x4/;
 
-    add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_h_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d117_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d135_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d153_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_v_predictor_8x8/, "$sse2_x86inc";
+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_tm_predictor_8x8/, "$sse2_x86inc";
+  add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_predictor_8x8/, "$sse2_x86inc";;
+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_top_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_left_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_128_predictor_8x8/;
+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d207_predictor_16x16/;
+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_8x8/, "$sse2_x86inc";
 
-    add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d45_predictor_16x16/;
+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_8x8/, "$sse2_x86inc";
 
-    add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d63_predictor_16x16/;
+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_8x8/, "$sse2_x86inc";;
 
-    add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_h_predictor_16x16/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d117_predictor_16x16/;
+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d135_predictor_16x16/;
+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_8x8/;
 
-    add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d153_predictor_16x16/;
+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
+  add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc";
+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
+  add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_top_predictor_16x16/;
+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_left_predictor_16x16/;
+  add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_128_predictor_16x16/;
+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d207_predictor_32x32/;
+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d45_predictor_32x32/;
+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d63_predictor_32x32/;
+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_h_predictor_32x32/;
+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
 
-    add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d117_predictor_32x32/;
+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc";
 
-    add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d135_predictor_32x32/;
+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
 
-    add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_d153_predictor_32x32/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_16x16/;
 
-    add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc";
+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_32x32/;
 
-    add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_top_predictor_32x32/;
+  add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_32x32/;
 
-    add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_left_predictor_32x32/;
+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_32x32/;
 
-    add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    specialize qw/vpx_highbd_dc_128_predictor_32x32/;
-  }  # CONFIG_VP9_HIGHBITDEPTH
-}  # CONFIG_VP9 || CONFIG_VP10
+  add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_32x32/;
+}  # CONFIG_VP9_HIGHBITDEPTH
 
 #
 # Sub Pixel Filters
@@ -421,10 +500,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Sub Pixel Filters
   #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_copy/;
+  specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_avg/;
+  specialize qw/vpx_highbd_convolve_avg/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
@@ -616,39 +695,6 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes"))
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct4x4_1_add/;
-
-  add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct4x4_16_add/;
-
-  add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_1_add/;
-
-  add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_64_add/;
-
-  add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_12_add/;
-
-  add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct16x16_1_add/;
-
-  add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct16x16_256_add/;
-
-  add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct16x16_10_add/;
-
-  add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct32x32_1024_add/;
-
-  add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct32x32_34_add/;
-
-  add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct32x32_1_add/;
-
   add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_iwht4x4_1_add/;
 
@@ -681,6 +727,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
   if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add/;
+
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add/;
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add/;
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add/;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add/;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add/;
+    
     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct4x4_16_add/;
 
@@ -696,6 +775,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct16x16_10_add/;
   } else {
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add sse2/;
+
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add sse2/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add sse2/;
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add sse2/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add sse2/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add sse2/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add sse2/;
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add sse2/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add sse2/;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add sse2/;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add sse2/;
+
     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct4x4_16_add sse2/;
 
@@ -801,25 +913,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 # Quantization
 #
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b/;
+  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
 
   add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/;
-
-  add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_highbd_quantize_b sse2/;
+  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
 
-  add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-} else {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b sse2/;
 
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-}  # CONFIG_VP9_HIGHBITDEPTH
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+  }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
@@ -1373,13 +1479,13 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
 # Specialty Subpixel
 #
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
+  specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
 
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
+  specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
 
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
+  specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
diff --git a/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm
new file mode 100644
index 00000000..cc26bb61
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm
@@ -0,0 +1,346 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, int ref_stride,
+;        unsigned char *src, int src_stride, unsigned int height,
+;        int *sum, unsigned int *sumsquared)
+; For each of `height` rows: averages 16 ref pixels with their right-hand
+; neighbours and with the same averages from the next ref row (pavgb, rounds
+; up), subtracts 16 src pixels, and accumulates the differences into *sum and
+; their squares into *sumsquared. Reads 17 bytes from each of height+1 ref rows.
+global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(vpx_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  difference accumulator (8 x int16)
+        pxor            xmm7,           xmm7                ;  sse accumulator (4 x int32)
+        mov             rsi,            arg(0) ;ref
+
+        mov             rdi,            arg(2) ;src
+        movsxd          rcx,            dword ptr arg(4) ;height (must be >= 1: the loop tests the count after the body)
+        movsxd          rax,            dword ptr arg(1) ;ref_stride
+        movsxd          rdx,            dword ptr arg(3)    ;src_stride
+
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal average of the first ref row
+
+        lea             rsi,            [rsi + rax]
+
+vpx_half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal average of ref row i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two horizontal averages
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = low 8 pixels as words
+        punpckhbw       xmm4,           xmm0                ;  xmm4 = high 8 pixels as words
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9,d10..d15
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4                ;  (two adds per lane per row: stays within int16 for height <= 64)
+        pmaddwd         xmm5,           xmm5                ;  xmm5 = sums of adjacent squared differences (dwords)
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;  one row done
+        jnz             vpx_half_horiz_vert_variance16x_h_1     ;  loop while rows remain
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5                    ; xmm5 = 0 for the dword interleaves below
+
+        punpcklwd   xmm0,           xmm6                    ; place each int16 sum in the upper half of a dword...
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16                      ; ...then shift back arithmetically: sign-extends to int32
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1                    ; xmm0 = 4 partial sums of differences
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7                    ; sse partials folded from 4 lanes to 2 (lanes 0 and 2)
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1                    ; same fold for the sum of differences
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7                    ; lane 0 of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1                    ; lane 0 of xmm0 = total of differences
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref, int ref_stride,
+;        unsigned char *src, int src_stride, unsigned int height,
+;        int *sum, unsigned int *sumsquared)
+; For each of `height` rows: averages 16 ref pixels with the 16 pixels directly
+; below them (pavgb, rounds up), subtracts 16 src pixels, and accumulates the
+; differences into *sum and their squares into *sumsquared.
+; Reads 16 bytes from each of height+1 ref rows.
+global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
+sym(vpx_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  difference accumulator (8 x int16)
+        pxor            xmm7,           xmm7                ;  sse accumulator (4 x int32)
+        mov             rsi,            arg(0)              ;ref
+
+        mov             rdi,            arg(2)              ;src
+        movsxd          rcx,            dword ptr arg(4)    ;height (must be >= 1: the loop tests the count after the body)
+        movsxd          rax,            dword ptr arg(1)    ;ref_stride
+        movsxd          rdx,            dword ptr arg(3)    ;src_stride
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  xmm5 = first ref row
+        lea             rsi,            [rsi + rax          ]
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
+
+vpx_half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  xmm3 = ref row i+1
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) vertical average of rows i and i+1
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = low 8 pixels as words
+        punpckhbw       xmm4,           xmm0                ;  xmm4 = high 8 pixels as words
+
+        movq            xmm2,           QWORD PTR [rdi]     ;  xmm2 = d0,d1,d2..d7
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9,d10..d15
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4                ;  (two adds per lane per row: stays within int16 for height <= 64)
+        pmaddwd         xmm5,           xmm5                ;  xmm5 = sums of adjacent squared differences (dwords)
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3                ;  keep row i+1 as the upper row of the next iteration
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             vpx_half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5                    ; xmm5 = 0 for the dword interleaves below
+
+        punpcklwd   xmm0,           xmm6                    ; place each int16 sum in the upper half of a dword...
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16                      ; ...then shift back arithmetically: sign-extends to int32
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1                    ; xmm0 = 4 partial sums of differences
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7                    ; sse partials folded from 4 lanes to 2 (lanes 0 and 2)
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1                    ; same fold for the sum of differences
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7                    ; lane 0 of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1                    ; lane 0 of xmm0 = total of differences
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref, int ref_stride,
+;        unsigned char *src, int src_stride, unsigned int height,
+;        int *sum, unsigned int *sumsquared)
+; For each of `height` rows: averages 16 ref pixels with their right-hand
+; neighbours (pavgb, rounds up), subtracts 16 src pixels, and accumulates the
+; differences into *sum and their squares into *sumsquared.
+; Reads 17 bytes from each of height ref rows.
+global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
+sym(vpx_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  difference accumulator (8 x int16)
+        pxor            xmm7,           xmm7                ;  sse accumulator (4 x int32)
+        mov             rsi,            arg(0) ;ref
+
+        mov             rdi,            arg(2) ;src
+        movsxd          rcx,            dword ptr arg(4) ;height (must be >= 1: the loop tests the count after the body)
+        movsxd          rax,            dword ptr arg(1) ;ref_stride
+        movsxd          rdx,            dword ptr arg(3)    ;src_stride
+
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
+
+vpx_half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal average
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0                ;  xmm1 = high 8 pixels as words
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9,d10..d15
+        punpcklbw       xmm2,           xmm0
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1                ;  (two adds per lane per row: stays within int16 for height <= 64)
+        pmaddwd         xmm5,           xmm5                ;  xmm5 = sums of adjacent squared differences (dwords)
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;  one row done
+        jnz             vpx_half_horiz_variance16x_h_1        ;  loop while rows remain
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5                    ; xmm5 = 0 for the dword interleaves below
+
+        punpcklwd   xmm0,           xmm6                    ; place each int16 sum in the upper half of a dword...
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16                      ; ...then shift back arithmetically: sign-extends to int32
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1                    ; xmm0 = 4 partial sums of differences
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7                    ; sse partials folded from 4 lanes to 2 (lanes 0 and 2)
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1                    ; same fold for the sum of differences
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7                    ; lane 0 of xmm6 = total of squared differences
+        paddd       xmm0,           xmm1                    ; lane 0 of xmm0 = total of differences
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};  eight words of 64, i.e. half of the 128 the rows below sum to
+align 16                          ; NOTE(review): neither table below is referenced by the routines in this file - confirm before removing
+xmm_bi_rd:                        ; presumably a rounding term for the 128-scaled filters below - confirm
+    times 8 dw 64
+align 16
+vpx_bilinear_filters_sse2:        ; 8 rows of 16 words: 8 x (128 - 16k) followed by 8 x 16k, for k = 0..7
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c b/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c
new file mode 100644
index 00000000..5782155b
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,  /* rows that get averaged (right and below) */
+                                            int ref_stride,
+                                            const unsigned char *src,  /* rows compared as-is, 16 pixels each */
+                                            int src_stride,
+                                            unsigned int height,  /* number of rows, must be >= 1 */
+                                            int *sum,  /* out: sum of (averaged ref - src) */
+                                            unsigned int *sumsquared);  /* out: sum of squared differences */
+void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
+                                       const unsigned char *src, int src_stride,
+                                       unsigned int height, int *sum,
+                                       unsigned int *sumsquared);  /* same contract; ref averaged with right neighbours only */
+void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
+                                      const unsigned char *src, int src_stride,
+                                      unsigned int height, int *sum,
+                                      unsigned int *sumsquared);  /* same contract; ref averaged with the row below only */
+/* 16x16 variance of dst against src averaged with its right-hand neighbours; *sse gets the squared-difference sum. */
+uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
+                                             int src_stride,
+                                             const unsigned char *dst,
+                                             int dst_stride,
+                                             uint32_t *sse) {
+  int xsum0;  /* sum of the 256 differences */
+  unsigned int xxsum0;  /* sum of the 256 squared differences */
+
+  vpx_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+                                    &xsum0, &xxsum0);
+  /* Returns sse - sum^2 / 256. |xsum0| <= 256 * 255, so the 32-bit unsigned square cannot wrap. */
+  *sse = xxsum0;
+  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
+/* 16x16 variance of dst against src averaged with the row below it; *sse gets the squared-difference sum. */
+uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
+                                             int src_stride,
+                                             const unsigned char *dst,
+                                             int dst_stride,
+                                             uint32_t *sse) {
+  int xsum0;  /* sum of the 256 differences */
+  unsigned int xxsum0;  /* sum of the 256 squared differences */
+  vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+                                   &xsum0, &xxsum0);
+  /* Returns sse - sum^2 / 256. |xsum0| <= 256 * 255, so the 32-bit unsigned square cannot wrap. */
+  *sse = xxsum0;
+  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
+
+/* 16x16 variance of dst against src averaged both rightwards and downwards; *sse gets the squared-difference sum. */
+uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
+                                              int src_stride,
+                                              const unsigned char *dst,
+                                              int dst_stride,
+                                              uint32_t *sse) {
+  int xsum0;  /* sum of the 256 differences */
+  unsigned int xxsum0;  /* sum of the 256 squared differences */
+
+  vpx_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+                                         &xsum0, &xxsum0);
+  /* Returns sse - sum^2 / 256. |xsum0| <= 256 * 255, so the 32-bit unsigned square cannot wrap. */
+  *sse = xxsum0;
+  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index f3af68f0..ae907fd0 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -21,7 +21,8 @@
   *(int *)(dest) = _mm_cvtsi128_si32(d0); \
 }
 
-void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16(
@@ -32,8 +33,8 @@ void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i input0, input1, input2, input3;
 
   // Rows
-  input0 = _mm_load_si128((const __m128i *)input);
-  input2 = _mm_load_si128((const __m128i *)(input + 8));
+  input0 = load_input_data(input);
+  input2 = load_input_data(input + 8);
 
   // Construct i3, i1, i3, i1, i2, i0, i2, i0
   input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -151,7 +152,8 @@ void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -449,7 +451,8 @@ void iadst4_sse2(__m128i *in) {
   out7 = _mm_subs_epi16(stp1_0, stp2_7); \
   }
 
-void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
@@ -469,14 +472,14 @@ void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   int i;
 
   // Load input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+  in0 = load_input_data(input);
+  in1 = load_input_data(input + 8 * 1);
+  in2 = load_input_data(input + 8 * 2);
+  in3 = load_input_data(input + 8 * 3);
+  in4 = load_input_data(input + 8 * 4);
+  in5 = load_input_data(input + 8 * 5);
+  in6 = load_input_data(input + 8 * 6);
+  in7 = load_input_data(input + 8 * 7);
 
   // 2-D
   for (i = 0; i < 2; i++) {
@@ -518,7 +521,8 @@ void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest + 7 * stride, in7);
 }
 
-void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -792,7 +796,8 @@ void iadst8_sse2(__m128i *in) {
   in[7] = _mm_sub_epi16(k__const_0, s1);
 }
 
-void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
@@ -812,10 +817,10 @@ void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
   // Rows. Load 4-row input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in0 = load_input_data(input);
+  in1 = load_input_data(input + 8 * 1);
+  in2 = load_input_data(input + 8 * 2);
+  in3 = load_input_data(input + 8 * 3);
 
   // 8x4 Transpose
   TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
@@ -1169,7 +1174,7 @@ void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                              stp2_10, stp2_13, stp2_11, stp2_12) \
     }
 
-void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
@@ -1214,22 +1219,22 @@ void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
     // 1-D idct
 
     // Load input data.
-    in[0] = _mm_load_si128((const __m128i *)input);
-    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
-    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
-    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
-    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
-    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
-    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
-    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
-    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
-    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
-    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
-    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
-    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+    in[0] = load_input_data(input);
+    in[8] = load_input_data(input + 8 * 1);
+    in[1] = load_input_data(input + 8 * 2);
+    in[9] = load_input_data(input + 8 * 3);
+    in[2] = load_input_data(input + 8 * 4);
+    in[10] = load_input_data(input + 8 * 5);
+    in[3] = load_input_data(input + 8 * 6);
+    in[11] = load_input_data(input + 8 * 7);
+    in[4] = load_input_data(input + 8 * 8);
+    in[12] = load_input_data(input + 8 * 9);
+    in[5] = load_input_data(input + 8 * 10);
+    in[13] = load_input_data(input + 8 * 11);
+    in[6] = load_input_data(input + 8 * 12);
+    in[14] = load_input_data(input + 8 * 13);
+    in[7] = load_input_data(input + 8 * 14);
+    in[15] = load_input_data(input + 8 * 15);
 
     array_transpose_8x8(in, in);
     array_transpose_8x8(in + 8, in + 8);
@@ -1294,7 +1299,8 @@ void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
   }
 }
 
-void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a, i;
@@ -2152,7 +2158,7 @@ void iadst16_sse2(__m128i *in0, __m128i *in1) {
   iadst16_8col(in1);
 }
 
-void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
@@ -2184,10 +2190,10 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   int i;
   // First 1-D inverse DCT
   // Load input data.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8 * 2);
+  in[2] = load_input_data(input + 8 * 4);
+  in[3] = load_input_data(input + 8 * 6);
 
   TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
 
@@ -2391,7 +2397,7 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
 #define LOAD_DQCOEFF(reg, input) \
   {  \
-    reg = _mm_load_si128((const __m128i *) input); \
+    reg = load_input_data(input); \
     input += 8; \
   }  \
 
@@ -3029,7 +3035,7 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 }
 
 // Only upper-left 8x8 has non-zero coeff
-void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3081,14 +3087,14 @@ void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
   int i;
 
   // Load input data. Only need to load the top left 8x8 block.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 32));
-  in[2] = _mm_load_si128((const __m128i *)(input + 64));
-  in[3] = _mm_load_si128((const __m128i *)(input + 96));
-  in[4] = _mm_load_si128((const __m128i *)(input + 128));
-  in[5] = _mm_load_si128((const __m128i *)(input + 160));
-  in[6] = _mm_load_si128((const __m128i *)(input + 192));
-  in[7] = _mm_load_si128((const __m128i *)(input + 224));
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 32);
+  in[2] = load_input_data(input + 64);
+  in[3] = load_input_data(input + 96);
+  in[4] = load_input_data(input + 128);
+  in[5] = load_input_data(input + 160);
+  in[6] = load_input_data(input + 192);
+  in[7] = load_input_data(input + 224);
 
   for (i = 8; i < 32; ++i) {
     in[i] = _mm_setzero_si128();
@@ -3188,7 +3194,7 @@ void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
   }
 }
 
-void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                  int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
@@ -3464,10 +3470,11 @@ void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
   }
 }
 
-void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
-  int a, i;
+  int a, j;
 
   a = dct_const_round_shift(input[0] * cospi_16_64);
   a = dct_const_round_shift(a * cospi_16_64);
@@ -3475,12 +3482,11 @@ void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
 
   dc_value = _mm_set1_epi16(a);
 
-  for (i = 0; i < 4; ++i) {
-    int j;
-    for (j = 0; j < 32; ++j) {
-      RECON_AND_STORE(dest + j * stride, dc_value);
-    }
-    dest += 8;
+  for (j = 0; j < 32; ++j) {
+    RECON_AND_STORE(dest +  0 + j * stride, dc_value);
+    RECON_AND_STORE(dest +  8 + j * stride, dc_value);
+    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
+    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
   }
 }
 
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index 658a9148..bd520c18 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
 
 // perform 8x8 transpose
 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
@@ -89,24 +90,35 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   res0[15] = tbuf[7];
 }
 
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
-  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
-  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
-  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
-  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
-  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
-  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
-  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
-  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
-
-  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
-  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
-  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
-  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
-  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
-  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
-  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
-  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
+// Lets the 8 bit optimisations be used when profile 0 is used with highbitdepth
+// enabled: returns the eight coefficients starting at data as a single __m128i.
+static INLINE __m128i load_input_data(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH  // NOTE(review): presumably tran_low_t is wider than int16_t in this build - confirm
+  return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],  // element-by-element gather
+      data[6], data[7]);  // octa_set_epi16 is presumably declared in txfm_common_sse2.h - confirm
+#else
+  return _mm_load_si128((const __m128i *)data);  // aligned load: data must be 16-byte aligned
+#endif
+}
+// Fills in[k] with load_input_data(input + k * 16) for k = 0..15.
+static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
+  in[0]  = load_input_data(input + 0 * 16);  // consecutive loads are 16 coefficients apart
+  in[1]  = load_input_data(input + 1 * 16);
+  in[2]  = load_input_data(input + 2 * 16);
+  in[3]  = load_input_data(input + 3 * 16);
+  in[4]  = load_input_data(input + 4 * 16);
+  in[5]  = load_input_data(input + 5 * 16);
+  in[6]  = load_input_data(input + 6 * 16);
+  in[7]  = load_input_data(input + 7 * 16);
+
+  in[8]  = load_input_data(input + 8 * 16);  // second group of eight, same spacing
+  in[9]  = load_input_data(input + 9 * 16);
+  in[10]  = load_input_data(input + 10 * 16);
+  in[11]  = load_input_data(input + 11 * 16);
+  in[12]  = load_input_data(input + 12 * 16);
+  in[13]  = load_input_data(input + 13 * 16);
+  in[14]  = load_input_data(input + 14 * 16);
+  in[15]  = load_input_data(input + 15 * 16);
 }
 
 #define RECON_AND_STORE(dest, in_x) \
diff --git a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 00000000..01c41291
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,544 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+
+  vzeroupper
+
+  ; If we can skip this block, then just zero the output
+  cmp                         skipmp, 0
+  jne .blank
+
+%ifnidn %1, b_32x32
+
+  ; Special case for ncoeff == 16, as it is frequent and we can save on
+  ; not setting up a loop.
+  cmp                       ncoeffmp, 16
+  jne .generic
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Special case of ncoeff == 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.single:
+
+  movifnidn                   coeffq, coeffmp
+  movifnidn                    zbinq, zbinmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+
+  ; Get DC and first 15 AC coeffs - in this special case, that is all.
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+  mova                            m9, [coeffq]
+  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
+  mova                           m10, [coeffq+32]
+  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
+%else
+  mova                            m9, [coeffq]             ; m9 = c[i]
+  mova                           m10, [coeffq+16]          ; m10 = c[i]
+%endif
+
+  mov                             r0, eobmp                ; Output pointer
+  mov                             r1, qcoeffmp             ; Output pointer
+  mov                             r2, dqcoeffmp            ; Output pointer
+
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  pcmpeqw                         m4, m4                   ; All word lanes -1
+  paddw                           m0, m4                   ; m0 = zbin - 1
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, we just write zeros
+  ; to the outputs and we are done.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .single_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova                       [r1   ], ymm5
+  mova                       [r1+32], ymm5
+  mova                       [r2   ], ymm5
+  mova                       [r2+32], ymm5
+%else
+  mova                          [r1], ymm5
+  mova                          [r2], ymm5
+%endif
+  mov                           [r0], word 0
+
+  vzeroupper
+  RET
+
+.single_nonzero:
+
+  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
+  movifnidn                       r4, roundmp
+  movifnidn                       r5, quantmp
+  mov                             r3, dequantmp
+  mov                             r6, shiftmp
+  mova                            m1, [r4]              ; m1 = round
+  mova                            m2, [r5]              ; m2 = quant
+  mova                            m3, [r3]              ; m3 = dequant
+  mova                            m4, [r6]              ; m4 = shift
+
+  mov                             r3, iscanmp
+
+  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova                  [qcoeffq   ], m11
+  mova                  [qcoeffq+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova                  [qcoeffq+32], m11
+  mova                  [qcoeffq+48], m6
+%else
+  mova                  [qcoeffq   ], m8
+  mova                  [qcoeffq+16], m13
+%endif
+
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova                 [dqcoeffq   ], m11
+  mova                 [dqcoeffq+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova                 [dqcoeffq+32], m11
+  mova                 [dqcoeffq+48], m6
+%else
+  mova                 [dqcoeffq   ], m8
+  mova                 [dqcoeffq+16], m13
+%endif
+
+  mova                            m6, [iscanq]            ; m6 = scan[i]
+  mova                           m11, [iscanq+16]         ; m11 = scan[i]
+
+  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
+  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
+  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
+  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
+  pandn                           m8,  m8,  m6            ; m8 = max(eob)
+  pandn                          m13, m13, m11            ; m13 = max(eob)
+  pmaxsw                          m8,  m8, m13
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                         [eobq], ax
+
+  vzeroupper
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of ncoeff != 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.generic:
+
+%endif ; %ifnidn %1, b_32x32
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+            qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+  ; Actual quantization loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+  mova                            m3, [r2]                 ; m3 = dequant
+  pcmpeqw                         m4, m4                   ; All lanes -1
+%ifidn %1, b_32x32
+  psubw                           m0, m4
+  psubw                           m1, m4
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  paddw                           m0, m4                   ; m0 = m0 - 1
+
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                         coeffq, [  coeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers & require 16bit numbers
+  mova                            m9, [coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [coeffq+ncoeffq*4+16]
+  mova                           m10, [coeffq+ncoeffq*4+32]
+  packssdw                       m10, [coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .first_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4   ], ymm5
+  mova        [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4   ], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+  mova           [qcoeffq+ncoeffq*2], ymm5
+  mova          [dqcoeffq+ncoeffq*2], ymm5
+%endif
+
+  add                        ncoeffq, mmsize
+
+  punpckhqdq                      m1, m1
+  punpckhqdq                      m2, m2
+  punpckhqdq                      m3, m3
+  punpckhqdq                      m4, m4
+  pxor                            m8, m8
+
+  jmp .ac_only_loop
+
+.first_nonzero:
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
+  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
+  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                    ; m6 = scan[i] + 1
+  psubw                          m11, m12                   ; m11 = scan[i] + 1
+  pandn                           m8, m6                    ; m8 = max(eob)
+  pandn                          m13, m11                   ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+
+.ac_only_loop:
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; pack coeff from 32bit to 16bit array
+  mova                            m9, [coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [coeffq+ncoeffq*4+16]
+  mova                           m10, [coeffq+ncoeffq*4+32]
+  packssdw                       m10, [coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration
+  ; and just write zeros, since that is what the result would be.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .rest_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
+  mova        [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
+%endif
+  add                        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                           [r2], ax
+  vzeroupper
+  RET
+
+.rest_nonzero:
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m14
+  punpckhwd                       m6, m14, m6
+  pmovsxwd                       m11, m14
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m14
+  punpckhwd                       m6, m14, m6
+  pmovsxwd                       m11, m14
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
+  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                    ; m6 = scan[i] + 1
+  psubw                          m11, m12                   ; m11 = scan[i] + 1
+  pandn                          m14, m6                    ; m14 = max(eob)
+  pandn                          m13, m11                   ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                           [r2], ax
+  vzeroupper
+  RET
+
+  ; Skip-block, i.e. just write all zeroes
+.blank:
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+            qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+
+DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+
+  neg                        ncoeffq
+  pxor                            m7, m7
+
+.blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
+  mova       [dqcoeffq+ncoeffq*4+32], ymm7
+  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
+  mova        [qcoeffq+ncoeffq*4+32], ymm7
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
+  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
+%endif
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+
+  mov                         [eobq], word 0
+
+  vzeroupper
+  RET
+%endmacro
+
+INIT_XMM avx
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
+
+END
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index c2a804e1..8aa4568d 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -14,11 +14,36 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+      (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4],
+      (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+#else
+  return _mm_load_si128((const __m128i *)coeff_ptr);
+#endif
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+                                      tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i one = _mm_set1_epi16(1);
+  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+  _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1);
+  _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2);
+#else
+  _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals);
+#endif
+}
+
+void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t* zbin_ptr,
                          const int16_t* round_ptr, const int16_t* quant_ptr,
-                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
-                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
+                         tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
                          uint16_t* eob_ptr,
                          const int16_t* scan_ptr,
                          const int16_t* iscan_ptr) {
@@ -56,8 +81,8 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
         __m128i qtmp0, qtmp1;
         __m128i cmp_mask0, cmp_mask1;
         // Do DC and first 15 AC
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
 
         // Poor man's sign extract
         coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -92,15 +117,15 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         dequant = _mm_unpackhi_epi64(dequant, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }
 
       {
@@ -134,8 +159,8 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
         __m128i qtmp0, qtmp1;
         __m128i cmp_mask0, cmp_mask1;
 
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
 
         // Poor man's sign extract
         coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -166,14 +191,14 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }
 
       {
@@ -212,10 +237,10 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
     }
   } else {
     do {
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
+      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
+      store_coefficients(zero, qcoeff_ptr + n_coeffs);
+      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
       n_coeffs += 8 * 2;
     } while (n_coeffs < 0);
     *eob_ptr = 0;
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
index 3784d9d2..ca215391 100644
--- a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -53,15 +53,29 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                         coeffq, [  coeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
   lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+  lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers & require 16bit numbers
+  mova                            m9, [  coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [  coeffq+ncoeffq*4+16]
+  mova                           m10, [  coeffq+ncoeffq*4+32]
+  packssdw                       m10, [  coeffq+ncoeffq*4+48]
+%else
   mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -82,8 +96,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                            m8, m7
   pand                           m13, m12
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  mova                           m11, m8
+  mova                            m6, m8
+  pcmpgtw                         m5, m8
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
   mova        [qcoeffq+ncoeffq*2+ 0], m8
   mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
 %ifidn %1, b_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
@@ -97,8 +131,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                          m8, m9
   psignw                         m13, m10
 %endif
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  mova                            m11, m8
+  mova                            m6, m8
+  pcmpgtw                         m5, m8
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
   mova       [dqcoeffq+ncoeffq*2+ 0], m8
   mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -112,8 +166,16 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   jz .accumulate_eob
 
 .ac_only_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; pack coeff from 32bit to 16bit array
+  mova                            m9, [  coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [  coeffq+ncoeffq*4+16]
+  mova                           m10, [  coeffq+ncoeffq*4+32]
+  packssdw                       m10, [  coeffq+ncoeffq*4+48]
+%else
   mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -136,8 +198,29 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                           m14, m7
   pand                           m13, m12
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pxor                           m11, m11
+  mova                           m11, m14
+  mova                            m6, m14
+  pcmpgtw                         m5, m14
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
   mova        [qcoeffq+ncoeffq*2+ 0], m14
   mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
 %ifidn %1, b_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
@@ -150,8 +233,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m14, m9
   psignw                         m13, m10
 %endif
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  mova                           m11, m14
+  mova                            m6, m14
+  pcmpgtw                         m5, m14
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5
+%else
   mova       [dqcoeffq+ncoeffq*2+ 0], m14
   mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -168,10 +271,21 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %ifidn %1, b_32x32
   jmp .accumulate_eob
 .skip_iter:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4+ 0], m5
+  mova        [qcoeffq+ncoeffq*4+16], m5
+  mova        [qcoeffq+ncoeffq*4+32], m5
+  mova        [qcoeffq+ncoeffq*4+48], m5
+  mova       [dqcoeffq+ncoeffq*4+ 0], m5
+  mova       [dqcoeffq+ncoeffq*4+16], m5
+  mova       [dqcoeffq+ncoeffq*4+32], m5
+  mova       [dqcoeffq+ncoeffq*4+48], m5
+%else
   mova        [qcoeffq+ncoeffq*2+ 0], m5
   mova        [qcoeffq+ncoeffq*2+16], m5
   mova       [dqcoeffq+ncoeffq*2+ 0], m5
   mova       [dqcoeffq+ncoeffq*2+16], m5
+%endif
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 %endif
@@ -196,15 +310,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mov                             r2, qcoeffmp
   mov                             r3, eobmp
   DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
   neg                        ncoeffq
   pxor                            m7, m7
 .blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       [dqcoeffq+ncoeffq*4+ 0], m7
+  mova       [dqcoeffq+ncoeffq*4+16], m7
+  mova       [dqcoeffq+ncoeffq*4+32], m7
+  mova       [dqcoeffq+ncoeffq*4+48], m7
+  mova        [qcoeffq+ncoeffq*4+ 0], m7
+  mova        [qcoeffq+ncoeffq*4+16], m7
+  mova        [qcoeffq+ncoeffq*4+32], m7
+  mova        [qcoeffq+ncoeffq*4+48], m7
+%else
   mova       [dqcoeffq+ncoeffq*2+ 0], m7
   mova       [dqcoeffq+ncoeffq*2+16], m7
   mova        [qcoeffq+ncoeffq*2+ 0], m7
   mova        [qcoeffq+ncoeffq*2+16], m7
+%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
   mov                    word [eobq], 0
diff --git a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index b2638370..9c5b414b 100644
--- a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -12,21 +12,77 @@
 
 SECTION .text
 
-%macro convolve_fn 1
+%macro convolve_fn 1-2
 INIT_XMM sse2
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+                                 fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
 cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                               fx, fxs, fy, fys, w, h
+%endif
   mov r4d, dword wm
+%ifidn %2, highbd
+  shl r4d, 1
+  shl srcq, 1
+  shl src_strideq, 1
+  shl dstq, 1
+  shl dst_strideq, 1
+%else
   cmp r4d, 4
   je .w4
+%endif
   cmp r4d, 8
   je .w8
   cmp r4d, 16
   je .w16
   cmp r4d, 32
   je .w32
+%ifidn %2, highbd
+  cmp r4d, 64
+  je .w64
 
   mov                    r4d, dword hm
+.loop128:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
+  jnz .loop128
+  RET
+%endif
+
+.w64
+  mov                    r4d, dword hm
 .loop64:
   movu                    m0, [srcq]
   movu                    m1, [srcq+16]
@@ -34,10 +90,10 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
   movu                    m3, [srcq+48]
   add                   srcq, src_strideq
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq+16]
-  pavgb                   m2, [dstq+32]
-  pavgb                   m3, [dstq+48]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
 %endif
   mova             [dstq   ], m0
   mova             [dstq+16], m1
@@ -57,10 +113,10 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
   movu                    m3, [srcq+src_strideq+16]
   lea                   srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq            +16]
-  pavgb                   m2, [dstq+dst_strideq]
-  pavgb                   m3, [dstq+dst_strideq+16]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq            +16]
+  pavg                    m2, [dstq+dst_strideq]
+  pavg                    m3, [dstq+dst_strideq+16]
 %endif
   mova [dstq               ], m0
   mova [dstq            +16], m1
@@ -82,10 +138,10 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
   movu                    m3, [srcq+r5q]
   lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq+dst_strideq]
-  pavgb                   m2, [dstq+dst_strideq*2]
-  pavgb                   m3, [dstq+r6q]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+dst_strideq]
+  pavg                    m2, [dstq+dst_strideq*2]
+  pavg                    m3, [dstq+r6q]
 %endif
   mova  [dstq              ], m0
   mova  [dstq+dst_strideq  ], m1
@@ -108,10 +164,10 @@ INIT_MMX sse
   movu                    m3, [srcq+r5q]
   lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb                   m0, [dstq]
-  pavgb                   m1, [dstq+dst_strideq]
-  pavgb                   m2, [dstq+dst_strideq*2]
-  pavgb                   m3, [dstq+r6q]
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+dst_strideq]
+  pavg                    m2, [dstq+dst_strideq*2]
+  pavg                    m3, [dstq+r6q]
 %endif
   mova  [dstq              ], m0
   mova  [dstq+dst_strideq  ], m1
@@ -122,6 +178,7 @@ INIT_MMX sse
   jnz .loop8
   RET
 
+%ifnidn %2, highbd
 .w4:
   mov                    r4d, dword hm
   lea                    r5q, [src_strideq*3]
@@ -137,10 +194,10 @@ INIT_MMX sse
   movh                    m5, [dstq+dst_strideq]
   movh                    m6, [dstq+dst_strideq*2]
   movh                    m7, [dstq+r6q]
-  pavgb                   m0, m4
-  pavgb                   m1, m5
-  pavgb                   m2, m6
-  pavgb                   m3, m7
+  pavg                    m0, m4
+  pavg                    m1, m5
+  pavg                    m2, m6
+  pavg                    m3, m7
 %endif
   movh  [dstq              ], m0
   movh  [dstq+dst_strideq  ], m1
@@ -150,7 +207,12 @@ INIT_MMX sse
   sub                    r4d, 4
   jnz .loop4
   RET
+%endif
 %endmacro
 
 convolve_fn copy
 convolve_fn avg
+%if CONFIG_VP9_HIGHBITDEPTH
+convolve_fn copy, highbd
+convolve_fn avg, highbd
+%endif
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 29ede19f..b7186785 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -41,7 +41,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
 
 #if defined(__clang__)
 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
-      (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
+    (defined(__APPLE__) && \
+        ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+            (__clang_major__ == 5 && __clang_minor__ == 0)))
+
 #  define MM256_BROADCASTSI128_SI256(x) \
        _mm_broadcastsi128_si256((__m128i const *)&(x))
 # else  // clang > 3.3, and not 5.0 on macosx.
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 772e01e8..6fd52087 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -203,123 +203,6 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
   }
 }
 
-static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
-                                                 ptrdiff_t src_pixels_per_line,
-                                                 uint8_t *output_ptr,
-                                                 ptrdiff_t output_pitch,
-                                                 uint32_t output_height,
-                                                 const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-    // reading the next 16 bytes.
-    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-    // filter the source buffer
-    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-    src_ptr+=src_pixels_per_line;
-
-    // save 16 bytes
-    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
-    output_ptr+=output_pitch;
-  }
-}
-
 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
                                          ptrdiff_t src_pitch,
                                          uint8_t *output_ptr,
@@ -408,141 +291,12 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
   }
 }
 
-static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
-                                                 ptrdiff_t src_pitch,
-                                                 uint8_t *output_ptr,
-                                                 ptrdiff_t out_pitch,
-                                                 uint32_t output_height,
-                                                 const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  // load the first 7 rows of 16 bytes
-  srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
-  for (i = 0; i < output_height; i++) {
-    // load the last 16 bytes
-    srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the result together
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-    // merge the result together
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-    // merge the result together
-    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-    src_ptr+=src_pitch;
-
-    // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
-
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-    output_ptr+=out_pitch;
-  }
-}
-
-#if ARCH_X86_64
-filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
-#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
-#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
-#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
-#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
-#else  // ARCH_X86
 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#endif  // ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index 68acc03c..3fbaa274 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -1,5 +1,5 @@
 ;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
@@ -8,1064 +8,662 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+%include "third_party/x86inc/x86inc.asm"
 
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro VERTx4 1
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.loop:
-    movd        xmm0, [rsi]                 ;A
-    movd        xmm1, [rsi + rdx]           ;B
-    movd        xmm2, [rsi + rdx * 2]       ;C
-    movd        xmm3, [rax + rdx * 2]       ;D
-    movd        xmm4, [rsi + rdx * 4]       ;E
-    movd        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movd        xmm6, [rsi + rbx]           ;G
-    movd        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    movdqa      xmm1, xmm2
-    paddsw      xmm0, xmm6
-    pmaxsw      xmm2, xmm4
-    pminsw      xmm4, xmm1
-    paddsw      xmm0, xmm4
-    paddsw      xmm0, xmm2
+SECTION_RODATA
+pw_64:    times 8 dw 64
 
-    paddsw      xmm0, krd
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
+; when using this instruction.
 
-    add         rsi,  rdx
-    add         rax,  rdx
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
+SECTION .text
+%if ARCH_X86_64
+  %define LOCAL_VARS_SIZE 16*4
+%else
+  %define LOCAL_VARS_SIZE 16*6
 %endif
-    movd        [rdi], xmm0
 
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%macro SETUP_LOCAL_VARS 0
+    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
+    ; pmaddubsw has a higher latency on some platforms, this might be eased by
+    ; interleaving the instructions.
+    %define    k0k1  [rsp + 16*0]
+    %define    k2k3  [rsp + 16*1]
+    %define    k4k5  [rsp + 16*2]
+    %define    k6k7  [rsp + 16*3]
+    packsswb     m4, m4
+    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+    ; some platforms.
+    pshuflw      m0, m4, 0b              ;k0_k1
+    pshuflw      m1, m4, 01010101b       ;k2_k3
+    pshuflw      m2, m4, 10101010b       ;k4_k5
+    pshuflw      m3, m4, 11111111b       ;k6_k7
+    punpcklqdq   m0, m0
+    punpcklqdq   m1, m1
+    punpcklqdq   m2, m2
+    punpcklqdq   m3, m3
+    mova       k0k1, m0
+    mova       k2k3, m1
+    mova       k4k5, m2
+    mova       k6k7, m3
+%if ARCH_X86_64
+    %define     krd  m12
+    %define     tmp  m13
+    mova        krd, [GLOBAL(pw_64)]
+%else
+    %define     tmp  [rsp + 16*4]
+    %define     krd  [rsp + 16*5]
+%if CONFIG_PIC=0
+    mova         m6, [GLOBAL(pw_64)]
 %else
-    add         rdi, r8
+    ; build constants without accessing global memory
+    pcmpeqb      m6, m6                  ;all ones
+    psrlw        m6, 15
+    psllw        m6, 6                   ;aka pw_64
 %endif
-    dec         rcx
-    jnz         .loop
-%endm
-
-%macro VERTx8 1
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+    mova        krd, m6
 %endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm6
-    movdqa      xmm1, xmm2
-    pmaxsw      xmm2, xmm4
-    pminsw      xmm4, xmm1
-    paddsw      xmm0, xmm4
-    paddsw      xmm0, xmm2
-
-    paddsw      xmm0, krd
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
+%endm
 
-    add         rsi,  rdx
-    add         rax,  rdx
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0
+%macro HORIZx4_ROW 2
+    mova      %2, %1
+    punpcklbw %1, %1
+    punpckhbw %2, %2
+
+    mova      m3, %2
+    palignr   %2, %1, 1
+    palignr   m3, %1, 5
+
+    pmaddubsw %2, k0k1k4k5
+    pmaddubsw m3, k2k3k6k7
+
+    mova      m4, %2
+    mova      m5, m3
+    psrldq    %2, 8
+    psrldq    m3, 8
+    mova      m6, m5
+
+    paddsw    m4, m3
+    pmaxsw    m5, %2
+    pminsw    %2, m6
+    paddsw    %2, m4
+    paddsw    %2, m5
+    paddsw    %2, krd
+    psraw     %2, 7
+    packuswb  %2, %2
+%endm
 
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
+;--- filter_block1d4_{h8,h8_avg}: 8-tap horizontal filter, 4 pixels per row ------
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
+                            src, sstride, dst, dstride, height, filter
+    mova                m4, [filterq]           ;16 bytes of taps, used as 8 words
+    packsswb            m4, m4                  ;narrow the taps to signed bytes
+%if ARCH_X86_64
+    %define       k0k1k4k5 m8
+    %define       k2k3k6k7 m9
+    %define            krd m10
+    %define    orig_height r7d
+    mova               krd, [GLOBAL(pw_64)]
+    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
+    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
+    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
 %else
-    add         rdi, r8
+    %define       k0k1k4k5 [rsp + 16*0]
+    %define       k2k3k6k7 [rsp + 16*1]
+    %define            krd [rsp + 16*2]
+    %define    orig_height [rsp + 16*3]
+    pshuflw             m6, m4, 0b              ;k0_k1
+    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
+    pshuflw             m7, m4, 01010101b       ;k2_k3
+    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+    mova                m1, [GLOBAL(pw_64)]
+%else
+    ; build constants without accessing global memory
+    pcmpeqb             m1, m1                  ;all ones
+    psrlw               m1, 15
+    psllw               m1, 6                   ;aka pw_64
 %endif
-    dec         rcx
-    jnz         .loop
-%endm
-
-
-%macro VERTx16 1
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+    mova          k0k1k4k5, m6                  ;spill the constants to the stack slots
+    mova          k2k3k6k7, m7
+    mova               krd, m1
 %endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
+    mov        orig_height, heightd             ;kept for the odd-row test after the loop
+    shr            heightd, 1                   ;row pairs; NOTE(review): loop tests after the body, so height >= 2 is assumed
 .loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm6
-    movdqa      xmm1, xmm2
-    pmaxsw      xmm2, xmm4
-    pminsw      xmm4, xmm1
-    paddsw      xmm0, xmm4
-    paddsw      xmm0, xmm2
-
-    paddsw      xmm0, krd
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
+    ;Do two rows at once
+    movh                m0, [srcq - 3]
+    movh                m1, [srcq + 5]
+    punpcklqdq          m0, m1                  ;row 0: src[-3 .. 12]
+    mova                m1, m0
+    movh                m2, [srcq + sstrideq - 3]
+    movh                m3, [srcq + sstrideq + 5]
+    punpcklqdq          m2, m3                  ;row 1: same window, next line
+    mova                m3, m2
+    punpcklbw           m0, m0                  ;duplicate bytes so palignr yields pairs
+    punpckhbw           m1, m1
+    punpcklbw           m2, m2
+    punpckhbw           m3, m3
+    mova                m4, m1
+    palignr             m4, m0,  1
+    pmaddubsw           m4, k0k1k4k5            ;row 0 pair sums
+    palignr             m1, m0,  5
+    pmaddubsw           m1, k2k3k6k7
+    mova                m7, m3
+    palignr             m7, m2,  1
+    pmaddubsw           m7, k0k1k4k5            ;row 1 pair sums
+    palignr             m3, m2,  5
+    pmaddubsw           m3, k2k3k6k7
+    mova                m0, m4
+    mova                m5, m1
+    mova                m2, m7
+    psrldq              m4, 8                   ;high-half sums down to the low half
+    psrldq              m1, 8
+    mova                m6, m5
+    paddsw              m0, m1
+    mova                m1, m3
+    psrldq              m7, 8
+    psrldq              m3, 8
+    paddsw              m2, m3
+    mova                m3, m1
+    pmaxsw              m5, m4
+    pminsw              m4, m6
+    paddsw              m4, m0                  ;row 0: smaller middle term first ...
+    paddsw              m4, m5                  ;... larger one last
+    pmaxsw              m1, m7
+    pminsw              m7, m3
+    paddsw              m7, m2                  ;row 1: same accumulation order
+    paddsw              m7, m1
+
+    paddsw              m4, krd                 ;round, shift and clamp row 0
+    psraw               m4, 7
+    packuswb            m4, m4
+    paddsw              m7, krd                 ;round, shift and clamp row 1
+    psraw               m7, 7
+    packuswb            m7, m7
+
+%ifidn %1, h8_avg
+    movd                m0, [dstq]              ;averaging variant: blend with dst
+    pavgb               m4, m0
+    movd                m2, [dstq + dstrideq]
+    pavgb               m7, m2
 %endif
-    movq        [rdi], xmm0
-
-    movq        xmm0, [rsi + 8]             ;A
-    movq        xmm1, [rsi + rdx + 8]       ;B
-    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
-    movq        xmm3, [rax + rdx * 2 + 8]   ;D
-    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
-    movq        xmm5, [rax + rdx * 4 + 8]   ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
+    movd            [dstq], m4
+    movd [dstq + dstrideq], m7
 
-    movq        xmm6, [rsi + rbx + 8]       ;G
-    movq        xmm7, [rax + rbx + 8]       ;H
-    punpcklbw   xmm6, xmm7                  ;G H
+    lea               srcq, [srcq + sstrideq        ]
+    prefetcht0              [srcq + 4 * sstrideq - 3]
+    lea               srcq, [srcq + sstrideq        ]
+    lea               dstq, [dstq + 2 * dstrideq    ]
+    prefetcht0              [srcq + 2 * sstrideq - 3]
 
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
+    dec            heightd
+    jnz              .loop
 
-    paddsw      xmm0, xmm6
-    movdqa      xmm1, xmm2
-    pmaxsw      xmm2, xmm4
-    pminsw      xmm4, xmm1
-    paddsw      xmm0, xmm4
-    paddsw      xmm0, xmm2
-
-    paddsw      xmm0, krd
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-%if %1
-    movq    xmm1, [rdi+8]
-    pavgb   xmm0, xmm1
-%endif
-
-    movq        [rdi+8], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
+    ; Do last row if output_height is odd
+    mov            heightd, orig_height
+    and            heightd, 1
+    je               .done
+
+    movh                m0, [srcq - 3]    ; load src
+    movh                m1, [srcq + 5]
+    punpcklqdq          m0, m1
+
+    HORIZx4_ROW         m0, m1
+%ifidn %1, h8_avg
+    movd                m0, [dstq]
+    pavgb               m1, m0
 %endif
-    dec         rcx
-    jnz         .loop
+    movd            [dstq], m1
+.done:
+    RET
 %endm
 
-;void vpx_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE
-sym(vpx_filter_block1d4_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    VERTx4 0
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE
-sym(vpx_filter_block1d8_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    VERTx8 0
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block1d16_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE
-sym(vpx_filter_block1d16_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    VERTx16 0
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d4_v8_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    VERTx4 1
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d8_v8_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    VERTx8 1
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d16_v8_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    VERTx16 1
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%macro HORIZx4_ROW 2
-    movdqa      %2,   %1
-    pshufb      %1,   [GLOBAL(shuf_t0t1)]
-    pshufb      %2,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   %1,   k0k1k4k5
-    pmaddubsw   %2,   k2k3k6k7
-
-    movdqa      xmm4, %1
-    movdqa      xmm5, %2
-    psrldq      %1,   8
-    psrldq      %2,   8
-    movdqa      xmm6, xmm5
-
-    paddsw      xmm4, %2
-    pmaxsw      xmm5, %1
-    pminsw      %1, xmm6
-    paddsw      %1, xmm4
-    paddsw      %1, xmm5
-
-    paddsw      %1,   krd
-    psraw       %1,   7
-    packuswb    %1,   %1
+%macro HORIZx8_ROW 5             ; arg1: 16 source bytes in / result out, arg2-5: scratch
+    mova        %2, %1
+    punpcklbw   %1, %1           ; low 8 bytes, each one duplicated
+    punpckhbw   %2, %2           ; high 8 bytes, each one duplicated
+
+    mova        %3, %2
+    mova        %4, %2
+    mova        %5, %2
+
+    palignr     %2, %1, 1        ; byte pairs (p0,p1) (p1,p2) ...
+    palignr     %3, %1, 5        ; byte pairs (p2,p3) (p3,p4) ...
+    palignr     %4, %1, 9        ; byte pairs (p4,p5) (p5,p6) ...
+    palignr     %5, %1, 13       ; byte pairs (p6,p7) (p7,p8) ...
+
+    pmaddubsw   %2, k0k1         ; one two-tap partial sum per output pixel
+    pmaddubsw   %3, k2k3
+    pmaddubsw   %4, k4k5
+    pmaddubsw   %5, k6k7
+
+    paddsw      %2, %5           ; outer partial sums first
+    mova        %1, %3
+    pminsw      %3, %4           ; smaller of the two middle partial sums
+    pmaxsw      %1, %4           ; larger of the two middle partial sums
+    paddsw      %2, %3           ; the smaller term is accumulated first ...
+    paddsw      %1, %2           ; ... and the larger one last
+    paddsw      %1, krd          ; add the rounding constant
+    psraw       %1, 7
+    packuswb    %1, %1           ; saturate the words to unsigned bytes
 %endm
 
-%macro HORIZx4 1
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm6, xmm4, 0b              ;k0_k1
-    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
-    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
-    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
-    pshufd      xmm5, xmm5, 0               ;rounding
-
-    movdqa      k0k1k4k5, xmm6
-    movdqa      k2k3k6k7, xmm7
-    movdqa      krd, xmm5
+;--- filter_block1d8_{h8,h8_avg}: 8-tap horizontal filter, 8 pixels per row ------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*2), 14, LOCAL_VARS_SIZE, \
+                            src, sstride, dst, dstride, height, filter
+    mova                 m4, [filterq]          ;16 bytes of taps, consumed by SETUP_LOCAL_VARS
+    SETUP_LOCAL_VARS
+%if ARCH_X86_64
+    %define     orig_height r7d
+%else
+    %define     orig_height heightmp
+%endif
+    mov         orig_height, heightd            ;r7d on x86-64, hence 8 GPRs requested above (as in SUBPIX_HFILTER4)
+    shr             heightd, 1                  ;row pairs; NOTE(review): loop tests after the body, so height >= 2 is assumed
 
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-    shr         rcx, 1
 .loop:
-    ;Do two rows once
-    movq        xmm0,   [rsi - 3]           ;load src
-    movq        xmm1,   [rsi + 5]
-    movq        xmm2,   [rsi + rax - 3]
-    movq        xmm3,   [rsi + rax + 5]
-    punpcklqdq  xmm0,   xmm1
-    punpcklqdq  xmm2,   xmm3
-
-    HORIZx4_ROW xmm0,   xmm1
-    HORIZx4_ROW xmm2,   xmm3
-%if %1
-    movd        xmm1,   [rdi]
-    pavgb       xmm0,   xmm1
-    movd        xmm3,   [rdi + rdx]
-    pavgb       xmm2,   xmm3
+    movh                 m0, [srcq - 3]
+    movh                 m3, [srcq + 5]
+    movh                 m4, [srcq + sstrideq - 3]
+    movh                 m7, [srcq + sstrideq + 5]
+    punpcklqdq           m0, m3                 ;row 0: src[-3 .. 12]
+    mova                 m1, m0
+    punpcklbw            m0, m0
+    punpckhbw            m1, m1
+    mova                 m5, m1
+    palignr              m5, m0, 13
+    pmaddubsw            m5, k6k7
+    mova                 m2, m1
+    mova                 m3, m1
+    palignr              m1, m0, 1
+    pmaddubsw            m1, k0k1
+    punpcklqdq           m4, m7                 ;row 1: same window, next line
+    mova                 m6, m4
+    punpcklbw            m4, m4
+    palignr              m2, m0, 5
+    punpckhbw            m6, m6
+    palignr              m3, m0, 9
+    mova                 m7, m6
+    pmaddubsw            m2, k2k3
+    pmaddubsw            m3, k4k5
+
+    palignr              m7, m4, 13
+    paddsw               m1, m5                 ;row 0: k0k1 + k6k7 sums
+    mova                 m5, m6
+    mova                 m0, m2
+    palignr              m5, m4, 5
+    pminsw               m2, m3
+    pmaddubsw            m7, k6k7
+    pmaxsw               m3, m0
+    paddsw               m1, m2                 ;row 0: smaller middle term first ...
+    mova                 m0, m6
+    palignr              m6, m4, 1
+    pmaddubsw            m5, k2k3
+    paddsw               m1, m3                 ;... larger one last
+    pmaddubsw            m6, k0k1
+    palignr              m0, m4, 9
+    paddsw               m1, krd
+    pmaddubsw            m0, k4k5
+    mova                 m4, m5
+    psraw                m1, 7
+    pminsw               m5, m0
+    paddsw               m6, m7                 ;row 1: k0k1 + k6k7 sums
+    packuswb             m1, m1                 ;row 0 result in m1
+
+    paddsw               m6, m5
+    pmaxsw               m0, m4
+    paddsw               m6, m0
+    paddsw               m6, krd
+    psraw                m6, 7
+    packuswb             m6, m6                 ;row 1 result in m6
+
+%ifidn %1, h8_avg
+    movh                 m0, [dstq]             ;averaging variant: blend with dst
+    movh                 m2, [dstq + dstrideq]
+    pavgb                m1, m0
+    pavgb                m6, m2
 %endif
-    movd        [rdi],  xmm0
-    movd        [rdi +rdx],  xmm2
+    movh             [dstq], m1
+    movh  [dstq + dstrideq], m6
 
-    lea         rsi,    [rsi + rax]
-    prefetcht0  [rsi + 4 * rax - 3]
-    lea         rsi,    [rsi + rax]
-    lea         rdi,    [rdi + 2 * rdx]
-    prefetcht0  [rsi + 2 * rax - 3]
+    lea                srcq, [srcq + sstrideq        ]
+    prefetcht0               [srcq + 4 * sstrideq - 3]
+    lea                srcq, [srcq + sstrideq        ]
+    lea                dstq, [dstq + 2 * dstrideq    ]
+    prefetcht0               [srcq + 2 * sstrideq - 3]
+    dec             heightd
+    jnz             .loop
 
-    dec         rcx
-    jnz         .loop
+    ;Do last row if output_height is odd
+    mov             heightd, orig_height
+    and             heightd, 1
+    je                .done
 
-    ; Do last row if output_height is odd
-    movsxd      rcx,    dword ptr arg(4)       ;output_height
-    and         rcx,    1
-    je          .done
+    movh                 m0, [srcq - 3]
+    movh                 m3, [srcq + 5]
+    punpcklqdq           m0, m3
 
-    movq        xmm0,   [rsi - 3]    ; load src
-    movq        xmm1,   [rsi + 5]
-    punpcklqdq  xmm0,   xmm1
+    HORIZx8_ROW          m0, m1, m2, m3, m4
 
-    HORIZx4_ROW xmm0, xmm1
-%if %1
-    movd        xmm1,   [rdi]
-    pavgb       xmm0,   xmm1
+%ifidn %1, h8_avg
+    movh                 m1, [dstq]
+    pavgb                m0, m1
 %endif
-    movd        [rdi],  xmm0
-.done
+    movh             [dstq], m0
+.done:
+    RET
 %endm
 
-%macro HORIZx8_ROW 4
-    movdqa      %2,   %1
-    movdqa      %3,   %1
-    movdqa      %4,   %1
-
-    pshufb      %1,   [GLOBAL(shuf_t0t1)]
-    pshufb      %2,   [GLOBAL(shuf_t2t3)]
-    pshufb      %3,   [GLOBAL(shuf_t4t5)]
-    pshufb      %4,   [GLOBAL(shuf_t6t7)]
-
-    pmaddubsw   %1,   k0k1
-    pmaddubsw   %2,   k2k3
-    pmaddubsw   %3,   k4k5
-    pmaddubsw   %4,   k6k7
-
-    paddsw      %1,   %4
-    movdqa      %4,   %2
-    pmaxsw      %2,   %3
-    pminsw      %3,   %4
-    paddsw      %1,   %3
-    paddsw      %1,   %2
-
-    paddsw      %1,   krd
-    psraw       %1,   7
-    packuswb    %1,   %1
+;--- filter_block1d16_{h8,h8_avg}: 8-tap horizontal filter, 16 pixels per row ----
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
+                             src, sstride, dst, dstride, height, filter
+    mova          m4, [filterq]             ;16 bytes of taps, consumed by SETUP_LOCAL_VARS
+    SETUP_LOCAL_VARS
+.loop:                                      ;one row per pass; tested after the body, so height >= 1 is assumed
+    prefetcht0        [srcq + 2 * sstrideq -3]
+
+    movh          m0, [srcq -  3]
+    movh          m4, [srcq +  5]
+    movh          m6, [srcq + 13]
+    punpcklqdq    m0, m4                    ;m0 = src[-3 .. 12], feeds pixels 0-7
+    mova          m7, m0
+    punpckhbw     m0, m0
+    mova          m1, m0
+    punpcklqdq    m4, m6                    ;m4 = src[5 .. 20], feeds pixels 8-15
+    mova          m3, m0
+    punpcklbw     m7, m7
+
+    palignr       m3, m7, 13
+    mova          m2, m0
+    pmaddubsw     m3, k6k7
+    palignr       m0, m7, 1
+    pmaddubsw     m0, k0k1
+    palignr       m1, m7, 5
+    pmaddubsw     m1, k2k3
+    palignr       m2, m7, 9
+    pmaddubsw     m2, k4k5
+    paddsw        m0, m3                    ;first half: k0k1 + k6k7 sums
+    mova          m3, m4
+    punpckhbw     m4, m4
+    mova          m5, m4
+    punpcklbw     m3, m3
+    mova          m7, m4
+    palignr       m5, m3, 5
+    mova          m6, m4
+    palignr       m4, m3, 1
+    pmaddubsw     m4, k0k1
+    pmaddubsw     m5, k2k3
+    palignr       m6, m3, 9
+    pmaddubsw     m6, k4k5
+    palignr       m7, m3, 13
+    pmaddubsw     m7, k6k7
+
+    mova          m3, m1
+    pmaxsw        m1, m2
+    pminsw        m2, m3
+    paddsw        m0, m2                    ;first half: smaller middle term first ...
+    paddsw        m0, m1                    ;... larger one last
+    paddsw        m4, m7                    ;second half: k0k1 + k6k7 sums
+    mova          m7, m5
+    pmaxsw        m5, m6
+    pminsw        m6, m7
+    paddsw        m4, m6
+    paddsw        m4, m5
+    paddsw        m0, krd
+    paddsw        m4, krd
+    psraw         m0, 7
+    psraw         m4, 7
+    packuswb      m0, m4                    ;pixels 0-7 from m0, 8-15 from m4, clamped to bytes
+%ifidn %1, h8_avg
+    mova          m1, [dstq]                ;averaging variant: blend with dst
+    pavgb         m0, m1
+%endif
+    lea         srcq, [srcq + sstrideq]
+    mova      [dstq], m0                    ;NOTE(review): mova presumably needs a 16-byte aligned dst (the replaced code used movdqa)
+    lea         dstq, [dstq + dstrideq]
+    dec      heightd
+    jnz        .loop
+    RET
 %endm
 
-%macro HORIZx8 1
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-    shr         rcx, 1
+INIT_XMM ssse3          ; presumably selects XMM registers and the ssse3 name suffix (x86inc) - confirm
+SUBPIX_HFILTER16 h8     ; each width is emitted twice: plain (h8) ...
+SUBPIX_HFILTER16 h8_avg ; ... and averaging with the existing dst (h8_avg)
+SUBPIX_HFILTER8  h8
+SUBPIX_HFILTER8  h8_avg
+SUBPIX_HFILTER4  h8
+SUBPIX_HFILTER4  h8_avg
+
+;--- filter_block1d<arg2>_<arg1>: 8-tap vertical filter; arg2 = 8 or narrower ----
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+                             src, sstride, dst, dstride, height, filter
+    mova          m4, [filterq]                 ;16 bytes of taps, consumed by SETUP_LOCAL_VARS
+    SETUP_LOCAL_VARS
+%if ARCH_X86_64
+    %define      src1q r7
+    %define  sstride6q r8
+    %define dst_stride dstrideq
+%else
+    %define      src1q filterq
+    %define  sstride6q dstrideq
+    %define dst_stride dstridemp
+%endif
+    mov       src1q, srcq
+    add       src1q, sstrideq                   ;src1q = src + 1 row; reaches odd rows with *2/*4/*6 offsets
+    lea   sstride6q, [sstrideq + sstrideq * 4]
+    add   sstride6q, sstrideq                   ;pitch * 6
 
+%ifidn %2, 8
+    %define movx movh
+%else
+    %define movx movd
+%endif
 .loop:
-    movq        xmm0,   [rsi - 3]           ;load src
-    movq        xmm3,   [rsi + 5]
-    movq        xmm4,   [rsi + rax - 3]
-    movq        xmm7,   [rsi + rax + 5]
-    punpcklqdq  xmm0,   xmm3
-    punpcklqdq  xmm4,   xmm7
-
-    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
-    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
-%if %1
-    movq        xmm1,   [rdi]
-    movq        xmm2,   [rdi + rdx]
-    pavgb       xmm0,   xmm1
-    pavgb       xmm4,   xmm2
+    movx         m0, [srcq                ]     ;A
+    movx         m1, [srcq + sstrideq     ]     ;B
+    punpcklbw    m0, m1                         ;A B
+    movx         m2, [srcq + sstrideq * 2 ]     ;C
+    pmaddubsw    m0, k0k1
+    mova         m6, m2
+    movx         m3, [src1q + sstrideq * 2]     ;D
+    punpcklbw    m2, m3                         ;C D
+    pmaddubsw    m2, k2k3
+    movx         m4, [srcq + sstrideq * 4 ]     ;E
+    mova         m7, m4
+    movx         m5, [src1q + sstrideq * 4]     ;F
+    punpcklbw    m4, m5                         ;E F
+    pmaddubsw    m4, k4k5
+    punpcklbw    m1, m6                         ;A B next iter
+    movx         m6, [srcq + sstride6q    ]     ;G
+    punpcklbw    m5, m6                         ;E F next iter
+    punpcklbw    m3, m7                         ;C D next iter
+    pmaddubsw    m5, k4k5
+    movx         m7, [src1q + sstride6q   ]     ;H
+    punpcklbw    m6, m7                         ;G H
+    pmaddubsw    m6, k6k7
+    mova        tmp, m2
+    pmaddubsw    m3, k2k3
+    pmaddubsw    m1, k0k1
+    pmaxsw       m2, m4
+    paddsw       m0, m6
+    movx         m6, [srcq + sstrideq * 8 ]     ;H next iter
+    punpcklbw    m7, m6
+    pmaddubsw    m7, k6k7
+    pminsw       m4, tmp
+    paddsw       m0, m4
+    mova         m4, m3
+    paddsw       m0, m2
+    pminsw       m3, m5
+    pmaxsw       m5, m4
+    paddsw       m0, krd
+    psraw        m0, 7
+    paddsw       m1, m7
+    packuswb     m0, m0                         ;first output row in m0
+
+    paddsw       m1, m3
+    paddsw       m1, m5
+    paddsw       m1, krd
+    psraw        m1, 7
+    lea        srcq, [srcq + sstrideq * 2 ]
+    lea       src1q, [src1q + sstrideq * 2]     ;keeps src1q = srcq + 1 row
+    packuswb     m1, m1                         ;second output row in m1
+
+%ifidn %1, v8_avg
+    movx         m2, [dstq]
+    pavgb        m0, m2
 %endif
-    movq        [rdi],  xmm0
-    movq        [rdi + rdx],  xmm4
-
-    lea         rsi,    [rsi + rax]
-    prefetcht0  [rsi + 4 * rax - 3]
-    lea         rsi,    [rsi + rax]
-    lea         rdi,    [rdi + 2 * rdx]
-    prefetcht0  [rsi + 2 * rax - 3]
-    dec         rcx
-    jnz         .loop
-
-    ;Do last row if output_height is odd
-    movsxd      rcx,    dword ptr arg(4)    ;output_height
-    and         rcx,    1
-    je          .done
-
-    movq        xmm0,   [rsi - 3]
-    movq        xmm3,   [rsi + 5]
-    punpcklqdq  xmm0,   xmm3
-
-    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
-%if %1
-    movq        xmm1,   [rdi]
-    pavgb       xmm0,   xmm1
+    movx     [dstq], m0
+    add        dstq, dst_stride
+%ifidn %1, v8_avg
+    movx         m3, [dstq]
+    pavgb        m1, m3
 %endif
-    movq        [rdi],  xmm0
-.done
+    movx     [dstq], m1
+    add        dstq, dst_stride
+    sub     heightd, 2                          ;NOTE(review): two rows are written before this test, so height >= 2 is assumed
+    cmp     heightd, 1
+    jg        .loop
+
+    cmp     heightd, 0                          ;one row left only when height is odd
+    je        .done
+
+    movx         m0, [srcq                ]     ;A
+    movx         m1, [srcq + sstrideq     ]     ;B
+    movx         m6, [srcq + sstride6q    ]     ;G
+    punpcklbw    m0, m1                         ;A B
+    movx         m7, [src1q + sstride6q   ]     ;H
+    pmaddubsw    m0, k0k1
+    movx         m2, [srcq + sstrideq * 2 ]     ;C
+    punpcklbw    m6, m7                         ;G H
+    movx         m3, [src1q + sstrideq * 2]     ;D
+    pmaddubsw    m6, k6k7
+    movx         m4, [srcq + sstrideq * 4 ]     ;E
+    punpcklbw    m2, m3                         ;C D
+    movx         m5, [src1q + sstrideq * 4]     ;F
+    punpcklbw    m4, m5                         ;E F
+    pmaddubsw    m2, k2k3
+    pmaddubsw    m4, k4k5
+    paddsw       m0, m6
+    mova         m1, m2
+    pmaxsw       m2, m4
+    pminsw       m4, m1
+    paddsw       m0, m4
+    paddsw       m0, m2
+    paddsw       m0, krd
+    psraw        m0, 7
+    packuswb     m0, m0
+%ifidn %1, v8_avg
+    movx         m1, [dstq]
+    pavgb        m0, m1
+%endif
+    movx     [dstq], m0
+.done:
+    RET
 %endm
 
-%macro HORIZx16 1
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
+;--- filter_block1d16_<arg1>: 8-tap vertical filter, 16 pixels per row -----------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+                             src, sstride, dst, dstride, height, filter
+
+    mova          m4, [filterq]                  ;16 bytes of taps, consumed by SETUP_LOCAL_VARS
+    SETUP_LOCAL_VARS
+%if ARCH_X86_64
+    %define      src1q r7
+    %define  sstride6q r8
+    %define dst_stride dstrideq
+%else
+    %define      src1q filterq
+    %define  sstride6q dstrideq
+    %define dst_stride dstridemp
+%endif
+    mov        src1q, srcq
+    add        src1q, sstrideq                   ;src1q = src + 1 row; reaches odd rows with *2/*4/*6 offsets
+    lea    sstride6q, [sstrideq + sstrideq * 4]
+    add    sstride6q, sstrideq                   ;pitch * 6
 
 .loop:
-    prefetcht0  [rsi + 2 * rax -3]
-
-    movq        xmm0,   [rsi - 3]           ;load src data
-    movq        xmm4,   [rsi + 5]
-    movq        xmm6,   [rsi + 13]
-    punpcklqdq  xmm0,   xmm4
-    punpcklqdq  xmm4,   xmm6
-
-    movdqa      xmm7,   xmm0
-
-    punpcklbw   xmm7,   xmm7
-    punpckhbw   xmm0,   xmm0
-    movdqa      xmm1,   xmm0
-    movdqa      xmm2,   xmm0
-    movdqa      xmm3,   xmm0
-
-    palignr     xmm0,   xmm7, 1
-    palignr     xmm1,   xmm7, 5
-    pmaddubsw   xmm0,   k0k1
-    palignr     xmm2,   xmm7, 9
-    pmaddubsw   xmm1,   k2k3
-    palignr     xmm3,   xmm7, 13
-
-    pmaddubsw   xmm2,   k4k5
-    pmaddubsw   xmm3,   k6k7
-    paddsw      xmm0,   xmm3
-
-    movdqa      xmm3,   xmm4
-    punpcklbw   xmm3,   xmm3
-    punpckhbw   xmm4,   xmm4
-
-    movdqa      xmm5,   xmm4
-    movdqa      xmm6,   xmm4
-    movdqa      xmm7,   xmm4
-
-    palignr     xmm4,   xmm3, 1
-    palignr     xmm5,   xmm3, 5
-    palignr     xmm6,   xmm3, 9
-    palignr     xmm7,   xmm3, 13
-
-    movdqa      xmm3,   xmm1
-    pmaddubsw   xmm4,   k0k1
-    pmaxsw      xmm1,   xmm2
-    pmaddubsw   xmm5,   k2k3
-    pminsw      xmm2,   xmm3
-    pmaddubsw   xmm6,   k4k5
-    paddsw      xmm0,   xmm2
-    pmaddubsw   xmm7,   k6k7
-    paddsw      xmm0,   xmm1
-
-    paddsw      xmm4,   xmm7
-    movdqa      xmm7,   xmm5
-    pmaxsw      xmm5,   xmm6
-    pminsw      xmm6,   xmm7
-    paddsw      xmm4,   xmm6
-    paddsw      xmm4,   xmm5
-
-    paddsw      xmm0,   krd
-    paddsw      xmm4,   krd
-    psraw       xmm0,   7
-    psraw       xmm4,   7
-    packuswb    xmm0,   xmm0
-    packuswb    xmm4,   xmm4
-    punpcklqdq  xmm0,   xmm4
-%if %1
-    movdqa      xmm1,   [rdi]
-    pavgb       xmm0,   xmm1
+    movh          m0, [srcq                ]     ;A
+    movh          m1, [srcq + sstrideq     ]     ;B
+    movh          m2, [srcq + sstrideq * 2 ]     ;C
+    movh          m3, [src1q + sstrideq * 2]     ;D
+    movh          m4, [srcq + sstrideq * 4 ]     ;E
+    movh          m5, [src1q + sstrideq * 4]     ;F
+
+    punpcklbw     m0, m1                         ;A B
+    movh          m6, [srcq + sstride6q]         ;G
+    punpcklbw     m2, m3                         ;C D
+    movh          m7, [src1q + sstride6q]        ;H
+    punpcklbw     m4, m5                         ;E F
+    pmaddubsw     m0, k0k1
+    movh          m3, [srcq + 8]                 ;A
+    pmaddubsw     m2, k2k3
+    punpcklbw     m6, m7                         ;G H
+    movh          m5, [srcq + sstrideq + 8]      ;B
+    pmaddubsw     m4, k4k5
+    punpcklbw     m3, m5                         ;A B
+    movh          m7, [srcq + sstrideq * 2 + 8]  ;C
+    pmaddubsw     m6, k6k7
+    mova          m1, m2
+    movh          m5, [src1q + sstrideq * 2 + 8] ;D
+    pmaxsw        m2, m4
+    punpcklbw     m7, m5                         ;C D
+    pminsw        m4, m1
+    paddsw        m0, m6
+    pmaddubsw     m3, k0k1
+    movh          m1, [srcq + sstrideq * 4 + 8]  ;E
+    paddsw        m0, m4
+    pmaddubsw     m7, k2k3
+    movh          m6, [src1q + sstrideq * 4 + 8] ;F
+    punpcklbw     m1, m6                         ;E F
+    paddsw        m0, m2
+    paddsw        m0, krd
+    movh          m2, [srcq + sstride6q + 8]     ;G
+    pmaddubsw     m1, k4k5
+    movh          m5, [src1q + sstride6q + 8]    ;H
+    psraw         m0, 7
+    punpcklbw     m2, m5                         ;G H
+    packuswb      m0, m0                         ;left 8 pixels in m0
+    pmaddubsw     m2, k6k7
+%ifidn %1, v8_avg
+    movh          m4, [dstq]
+    pavgb         m0, m4
 %endif
-
-    lea         rsi,    [rsi + rax]
-    movdqa      [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .loop
+    movh      [dstq], m0
+    mova          m6, m7
+    pmaxsw        m7, m1
+    pminsw        m1, m6
+    paddsw        m3, m2
+    paddsw        m3, m1
+    paddsw        m3, m7
+    paddsw        m3, krd
+    psraw         m3, 7
+    packuswb      m3, m3                         ;right 8 pixels in m3
+
+    add         srcq, sstrideq
+    add        src1q, sstrideq
+%ifidn %1, v8_avg
+    movh          m1, [dstq + 8]
+    pavgb         m3, m1
+%endif
+    movh  [dstq + 8], m3
+    add         dstq, dst_stride
+    dec      heightd                             ;one row per pass; tested after the body, so height >= 1 is assumed
+    jnz        .loop
+    RET
 %endm
 
-;void vpx_filter_block1d4_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE
-sym(vpx_filter_block1d4_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 3
-    %define k0k1k4k5 [rsp + 16 * 0]
-    %define k2k3k6k7 [rsp + 16 * 1]
-    %define krd      [rsp + 16 * 2]
-
-    HORIZx4 0
-
-    add rsp, 16 * 3
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block1d8_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE
-sym(vpx_filter_block1d8_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    HORIZx8 0
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block1d16_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE
-sym(vpx_filter_block1d16_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    HORIZx16 0
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d4_h8_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 3
-    %define k0k1k4k5 [rsp + 16 * 0]
-    %define k2k3k6k7 [rsp + 16 * 1]
-    %define krd      [rsp + 16 * 2]
-
-    HORIZx4 1
-
-    add rsp, 16 * 3
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d8_h8_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    HORIZx8 1
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE
-sym(vpx_filter_block1d16_h8_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    HORIZx16 1
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-SECTION_RODATA
-align 16
-shuf_t0t1:
-    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-align 16
-shuf_t2t3:
-    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-align 16
-shuf_t4t5:
-    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-align 16
-shuf_t6t7:
-    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+INIT_XMM ssse3
+SUBPIX_VFILTER16     v8
+SUBPIX_VFILTER16 v8_avg
+SUBPIX_VFILTER       v8, 8
+SUBPIX_VFILTER   v8_avg, 8
+SUBPIX_VFILTER       v8, 4
+SUBPIX_VFILTER   v8_avg, 4
diff --git a/libvpx/vpx_mem/vpx_mem.c b/libvpx/vpx_mem/vpx_mem.c
index c6f501a6..b98fe83c 100644
--- a/libvpx/vpx_mem/vpx_mem.c
+++ b/libvpx/vpx_mem/vpx_mem.c
@@ -93,11 +93,10 @@ void vpx_free(void *memblk) {
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void *vpx_memset16(void *dest, int val, size_t length) {
-  int i;
-  void *orig = dest;
-  uint16_t *dest16 = dest;
+  size_t i;
+  uint16_t *dest16 = (uint16_t *)dest;
   for (i = 0; i < length; i++)
     *dest16++ = val;
-  return orig;
+  return dest;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_ports/bitops.h b/libvpx/vpx_ports/bitops.h
index 0d3223e3..84ff3659 100644
--- a/libvpx/vpx_ports/bitops.h
+++ b/libvpx/vpx_ports/bitops.h
@@ -11,6 +11,8 @@
 #ifndef VPX_PORTS_BITOPS_H_
 #define VPX_PORTS_BITOPS_H_
 
+#include <assert.h>
+
 #include "vpx_ports/msvc.h"
 
 #ifdef _MSC_VER
@@ -25,10 +27,15 @@
 extern "C" {
 #endif
 
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
 static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
@@ -36,6 +43,7 @@ static INLINE int get_msb(unsigned int n) {
 
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
+  assert(n != 0);
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
@@ -47,6 +55,8 @@ static INLINE int get_msb(unsigned int n) {
   unsigned int value = n;
   int i;
 
+  assert(n != 0);
+
   for (i = 4; i >= 0; --i) {
     const int shift = (1 << i);
     const unsigned int x = value >> shift;
diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h
index fd5d54ba..37b255d4 100644
--- a/libvpx/vpx_scale/yv12config.h
+++ b/libvpx/vpx_scale/yv12config.h
@@ -56,6 +56,9 @@ typedef struct yv12_buffer_config {
   int subsampling_y;
   unsigned int bit_depth;
   vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
+  int render_width;
+  int render_height;
 
   int corrupted;
   int flags;
diff --git a/libvpx/vpx_util/endian_inl.h b/libvpx/vpx_util/endian_inl.h
index 6b177f17..37bdce1c 100644
--- a/libvpx/vpx_util/endian_inl.h
+++ b/libvpx/vpx_util/endian_inl.h
@@ -25,14 +25,10 @@
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 
-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif  // __clang__
+// handle clang compatibility
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
 
 // some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
 #if !defined(WORDS_BIGENDIAN) && \
@@ -53,14 +49,16 @@
 #define HToBE32(X) BSwap32(X)
 #endif
 
-// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
-#if LOCAL_GCC_PREREQ(4, 3) || LOCAL_CLANG_PREREQ(3, 3)
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
 #define HAVE_BUILTIN_BSWAP32
-#define HAVE_BUILTIN_BSWAP64
 #endif
-// clang-3.3 and gcc-4.8 have a builtin function for swap16
-#if LOCAL_GCC_PREREQ(4, 8) || LOCAL_CLANG_PREREQ(3, 3)
-#define HAVE_BUILTIN_BSWAP16
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
 #endif
 
 #if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c
index 3c61bd92..285d58e1 100644
--- a/libvpx/vpxdec.c
+++ b/libvpx/vpxdec.c
@@ -562,7 +562,7 @@ static int main_loop(int argc, const char **argv_) {
   int                     opt_i420 = 0;
   vpx_codec_dec_cfg_t     cfg = {0, 0, 0};
 #if CONFIG_VP9_HIGHBITDEPTH
-  int                     output_bit_depth = 0;
+  unsigned int            output_bit_depth = 0;
 #endif
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
@@ -618,9 +618,6 @@ static int main_loop(int argc, const char **argv_) {
       use_y4m = 0;
       flipuv = 1;
       opt_yv12 = 1;
-#if CONFIG_VP9_HIGHBITDEPTH
-      output_bit_depth = 8;  // For yv12 8-bit depth output is assumed
-#endif
     } else if (arg_match(&arg, &use_i420, argi)) {
       use_y4m = 0;
       flipuv = 0;
@@ -956,22 +953,22 @@ static int main_loop(int argc, const char **argv_) {
           // these is set to 0, use the display size set in the first frame
           // header. If that is unavailable, use the raw decoded size of the
           // first decoded frame.
-          int display_width = vpx_input_ctx.width;
-          int display_height = vpx_input_ctx.height;
-          if (!display_width || !display_height) {
-            int display_size[2];
+          int render_width = vpx_input_ctx.width;
+          int render_height = vpx_input_ctx.height;
+          if (!render_width || !render_height) {
+            int render_size[2];
             if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE,
-                                  display_size)) {
+                                  render_size)) {
               // As last resort use size of first frame as display size.
-              display_width = img->d_w;
-              display_height = img->d_h;
+              render_width = img->d_w;
+              render_height = img->d_h;
             } else {
-              display_width = display_size[0];
-              display_height = display_size[1];
+              render_width = render_size[0];
+              render_height = render_size[1];
             }
           }
-          scaled_img = vpx_img_alloc(NULL, img->fmt, display_width,
-                                     display_height, 16);
+          scaled_img = vpx_img_alloc(NULL, img->fmt, render_width,
+                                     render_height, 16);
           scaled_img->bit_depth = img->bit_depth;
         }
 
@@ -990,11 +987,11 @@ static int main_loop(int argc, const char **argv_) {
       }
 #if CONFIG_VP9_HIGHBITDEPTH
       // Default to codec bit depth if output bit depth not set
-      if (!output_bit_depth) {
+      if (!output_bit_depth && single_file && !do_md5) {
         output_bit_depth = img->bit_depth;
       }
       // Shift up or down if necessary
-      if (output_bit_depth != img->bit_depth) {
+      if (output_bit_depth != 0 && output_bit_depth != img->bit_depth) {
         const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ?
             img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) :
             img->fmt | VPX_IMG_FMT_HIGHBITDEPTH;
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index 06604ea0..cb78226b 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -1996,7 +1996,7 @@ int main(int argc, const char **argv_) {
     usage_exit();
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
-  if (global.codec->fourcc == VP9_FOURCC)
+  if (global.codec->fourcc == VP9_FOURCC || global.codec->fourcc == VP10_FOURCC)
     input.only_i420 = 0;
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
diff --git a/libvpx/webmdec.cc b/libvpx/webmdec.cc
index 1020d046..f541cfec 100644
--- a/libvpx/webmdec.cc
+++ b/libvpx/webmdec.cc
@@ -94,7 +94,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
     }
   }
 
-  if (video_track == NULL) {
+  if (video_track == NULL || video_track->GetCodecId() == NULL) {
     rewind_and_reset(webm_ctx, vpx_ctx);
     return 0;
   }
diff --git a/update_libvpx.sh b/update_libvpx.sh
index 92e40ebe..4e41bf52 100755
--- a/update_libvpx.sh
+++ b/update_libvpx.sh
@@ -10,7 +10,7 @@
 # Usage:
 #
 # $ ./update_libvpx.sh [branch | revision | file or url containing a revision]
-# When specifying a branch it must be prefixed with origin/
+# When specifying a branch it may be necessary to prefix with origin/
 
 # Tools required for running this tool:
 #
@@ -113,7 +113,7 @@ rm -rf .git .gitignore .gitattributes
 
 # Add and remove files.
 echo "$add" | xargs -I {} git add {}
-echo "$delete" | xargs -I {} git rm {}
+echo "$delete" | xargs -I {} git rm --ignore-unmatch {}
 
 # Find empty directories and remove them.
 find . -type d -empty -exec git rm {} \;
author	Bill Yi <byi@google.com>	2016-02-22 19:24:59 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	2016-02-22 19:24:59 +0000
commit	857bb8df092ee86783ab6933063a736929a07227 (patch)
tree	f42181486e87a18dba9945956209fae0366172cb
parent	30dc5b6cbc88d67b24843b52c282e13f070b4ebc (diff)
parent	c927526be9a7b72fb5edb3f29c4e8ceabe0ec98a (diff)
download	platform_external_libvpx-brillo-m10-dev.tar.gz platform_external_libvpx-brillo-m10-dev.tar.bz2 platform_external_libvpx-brillo-m10-dev.zip