Diffstat (limited to 'libvpx/vp9')
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c  91
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c  47
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm  36
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm  44
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm  570
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm  16
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm  14
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm  14
-rw-r--r--  libvpx/vp9/common/generic/vp9_systemdependent.c  2
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h  117
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c  281
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c  833
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c  784
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c  713
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c  266
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c  695
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c  1038
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c  1284
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c  923
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c  396
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c  1315
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c  1073
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c  1013
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c  438
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c  745
-rw-r--r--  libvpx/vp9/common/vp9_alloccommon.c  42
-rw-r--r--  libvpx/vp9/common/vp9_blockd.h  147
-rw-r--r--  libvpx/vp9/common/vp9_common.h  12
-rw-r--r--  libvpx/vp9/common/vp9_common_data.c  11
-rw-r--r--  libvpx/vp9/common/vp9_common_data.h  3
-rw-r--r--  libvpx/vp9/common/vp9_convolve.c  12
-rw-r--r--  libvpx/vp9/common/vp9_convolve.h  13
-rw-r--r--  libvpx/vp9/common/vp9_debugmodes.c  4
-rw-r--r--  libvpx/vp9/common/vp9_default_coef_probs.h  3
-rw-r--r--  libvpx/vp9/common/vp9_entropy.c  325
-rw-r--r--  libvpx/vp9/common/vp9_entropy.h  251
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.c  478
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.h  22
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.c  98
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.h  23
-rw-r--r--  libvpx/vp9/common/vp9_enums.h  13
-rw-r--r--  libvpx/vp9/common/vp9_filter.c  31
-rw-r--r--  libvpx/vp9/common/vp9_filter.h  30
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.c  19
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.h  62
-rw-r--r--  libvpx/vp9/common/vp9_idct.c  273
-rw-r--r--  libvpx/vp9/common/vp9_idct.h  25
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c  52
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.h  15
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter_filters.c  2
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.c  28
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.h  4
-rw-r--r--  libvpx/vp9/common/vp9_onyx.h  63
-rw-r--r--  libvpx/vp9/common/vp9_onyxc_int.h  166
-rw-r--r--  libvpx/vp9/common/vp9_postproc.c  40
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.c  111
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.h  55
-rw-r--r--  libvpx/vp9/common/vp9_quant_common.c  118
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.c  163
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.h  8
-rw-r--r--  libvpx/vp9/common/vp9_reconintra.c  4
-rw-r--r--  libvpx/vp9/common/vp9_reconintra.h  8
-rw-r--r--  libvpx/vp9/common/vp9_rtcd.c  4
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.sh  179
-rw-r--r--  libvpx/vp9/common/vp9_scale.c  115
-rw-r--r--  libvpx/vp9/common/vp9_scale.h  30
-rw-r--r--  libvpx/vp9/common/vp9_scan.c  357
-rw-r--r--  libvpx/vp9/common/vp9_scan.h  199
-rw-r--r--  libvpx/vp9/common/vp9_seg_common.c  2
-rw-r--r--  libvpx/vp9/common/vp9_seg_common.h  2
-rw-r--r--  libvpx/vp9/common/vp9_subpelvar.h  145
-rw-r--r--  libvpx/vp9/common/vp9_systemdependent.h  5
-rw-r--r--  libvpx/vp9/common/vp9_tile_common.c  21
-rw-r--r--  libvpx/vp9/common/vp9_tile_common.h  12
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.c  34
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.h  45
-rw-r--r--  libvpx/vp9/common/x86/vp9_asm_stubs.c  319
-rw-r--r--  libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c  1322
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_sse2.asm  32
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm  813
-rw-r--r--  libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c  943
-rw-r--r--  libvpx/vp9/common/x86/vp9_postproc_x86.h  2
-rw-r--r--  libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm  987
-rw-r--r--  libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm  274
-rw-r--r--  libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm  230
-rw-r--r--  libvpx/vp9/decoder/vp9_dboolhuff.h  2
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.c  513
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.h  6
-rw-r--r--  libvpx/vp9/decoder/vp9_decodframe.c  1061
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.c  115
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.h  5
-rw-r--r--  libvpx/vp9/decoder/vp9_dsubexp.c  15
-rw-r--r--  libvpx/vp9/decoder/vp9_idct_blk.c  152
-rw-r--r--  libvpx/vp9/decoder/vp9_idct_blk.h  30
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd.h  8
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_if.c  104
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_int.h  16
-rw-r--r--  libvpx/vp9/decoder/vp9_read_bit_buffer.h  6
-rw-r--r--  libvpx/vp9/decoder/vp9_thread.c  17
-rw-r--r--  libvpx/vp9/decoder/vp9_thread.h  13
-rw-r--r--  libvpx/vp9/decoder/vp9_treereader.h  3
-rw-r--r--  libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c  220
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c  365
-rw-r--r--  libvpx/vp9/encoder/vp9_block.h  94
-rw-r--r--  libvpx/vp9/encoder/vp9_boolhuff.c  39
-rw-r--r--  libvpx/vp9/encoder/vp9_dct.c  216
-rw-r--r--  libvpx/vp9/encoder/vp9_dct.h  24
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c  989
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.h  2
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeintra.c  2
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.c  230
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.h  27
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.c  100
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.h  4
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c  372
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.h  1
-rw-r--r--  libvpx/vp9/encoder/vp9_lookahead.c  4
-rw-r--r--  libvpx/vp9/encoder/vp9_mbgraph.c  81
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.c  766
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.h  33
-rw-r--r--  libvpx/vp9/encoder/vp9_modecosts.c  4
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_if.c  1424
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_int.h  141
-rw-r--r--  libvpx/vp9/encoder/vp9_picklpf.c  43
-rw-r--r--  libvpx/vp9/encoder/vp9_psnr.c  2
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c  73
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.h  26
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.c  93
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.h  4
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c  1887
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.h  33
-rw-r--r--  libvpx/vp9/encoder/vp9_segmentation.c  43
-rw-r--r--  libvpx/vp9/encoder/vp9_ssim.c  4
-rw-r--r--  libvpx/vp9/encoder/vp9_subexp.c  3
-rw-r--r--  libvpx/vp9/encoder/vp9_subexp.h  2
-rw-r--r--  libvpx/vp9/encoder/vp9_temporal_filter.c  56
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.c  182
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.h  10
-rw-r--r--  libvpx/vp9/encoder/vp9_vaq.c  147
-rw-r--r--  libvpx/vp9/encoder/vp9_vaq.h  26
-rw-r--r--  libvpx/vp9/encoder/vp9_variance.h  15
-rw-r--r--  libvpx/vp9/encoder/vp9_variance_c.c  147
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c  290
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_sse2.c  69
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_mmx.c  45
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_sse2.c  29
-rw-r--r--  libvpx/vp9/vp9_common.mk  25
-rw-r--r--  libvpx/vp9/vp9_cx_iface.c  241
-rw-r--r--  libvpx/vp9/vp9_dx_iface.c  63
-rw-r--r--  libvpx/vp9/vp9cx.mk  3
-rw-r--r--  libvpx/vp9/vp9dx.mk  5
155 files changed, 23894 insertions, 8462 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 3e3e400..0b9fc09 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,45 +11,47 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-
-void vp9_short_idct16x16_add_neon(int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
+ int64_t store_reg[8];
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
- save_neon_registers();
+ vp9_push_neon(store_reg);
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_add_neon_pass2(input+1,
+ vp9_idct16x16_256_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
@@ -59,12 +61,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
@@ -74,12 +76,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
@@ -89,12 +91,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
@@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
dest_stride);
// restore d8-d15 register values.
- restore_neon_registers();
+ vp9_pop_neon(store_reg);
return;
}
-void vp9_short_idct10_16x16_add_neon(int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
+ int64_t store_reg[8];
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
- save_neon_registers();
+ vp9_push_neon(store_reg);
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
+ vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct10_16x16_add_neon_pass2(input+1,
+ vp9_idct16x16_10_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
@@ -135,12 +138,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
@@ -150,12 +153,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
@@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
dest_stride);
// restore d8-d15 register values.
- restore_neon_registers();
+ vp9_pop_neon(store_reg);
return;
}
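
A note on the register save/restore change above: the old save_neon_registers()/restore_neon_registers() helpers did a bare vpush/vpop and returned, leaving the stack pointer 64 bytes lower between the two calls. The replacement pair, vp9_push_neon()/vp9_pop_neon(), spills d8-d15 into a caller-provided buffer instead, so the stack stays balanced across every call boundary. A minimal usage sketch in C, assuming only the declarations shown in this diff:

    /* d8-d15 are callee-saved under the ARM AAPCS, so a NEON kernel that
     * clobbers them must preserve their values. store_reg is ordinary
     * caller memory: eight 64-bit slots, one per d-register. */
    int64_t store_reg[8];
    vp9_push_neon(store_reg);   /* spill d8-d15 (vp9_save_reg_neon.asm) */
    /* ... NEON code that freely uses d8-d15 ... */
    vp9_pop_neon(store_reg);    /* reload d8-d15 */
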
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c
deleted file mode 100644
index ceecd6f..0000000
--- a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_common.h"
-
-// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
-extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
- int16_t *output, int16_t *input);
-extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
-
-
-// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
- int dest_stride) {
- // TODO(cd): move the creation of these buffers within the ASM file
- // internal buffer used to transpose 8 lines into before transforming them
- int16_t transpose_buffer[32 * 8];
- // results of the first pass (transpose and transform rows)
- int16_t pass1[32 * 32];
- // results of the second pass (transpose and transform columns)
- int16_t pass2[32 * 32];
-
- // save register we need to preserve
- save_neon_registers();
- // process rows
- idct32_transpose_and_transform(transpose_buffer, pass1, input);
- // process columns
- // TODO(cd): do these two steps/passes within the ASM file
- idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
- // combine and add to dest
- // TODO(cd): integrate this within the last storage step of the second pass
- idct32_combine_add(dest, pass2, dest_stride);
- // restore register we need to preserve
- restore_neon_registers();
-}
-
-// TODO(cd): Eliminate this file altogether when everything is in ASM file
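
With this C wrapper gone, the three scratch buffers it allocated move into the stack frame that vp9_idct32x32_1024_add_neon now builds for itself (see the "stack operation" comments in the 32x32 asm diff below). A sketch of that frame as a C struct, using the sp offsets stated in those comments:

    /* Equivalent view of the frame created by "sub sp, sp, #512+2048+2048": */
    struct idct32_stack_frame {
      int16_t pass1[32 * 32];            /* sp +    0 .. 2047 */
      int16_t pass2[32 * 32];            /* sp + 2048 .. 4095 */
      int16_t transpose_buffer[32 * 8];  /* sp + 4096 .. 4607 */
    };
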
diff --git a/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm
new file mode 100644
index 0000000..71c3e70
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_push_neon|
+ EXPORT |vp9_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|vp9_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
index cf5c8f7..b1fd21b 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct16x16_1_add_neon|
+ EXPORT |vp9_idct16x16_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct16x16_1_add_neon| PROC
+|vp9_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -193,6 +193,6 @@
vst1.64 {d31}, [r12], r2
bx lr
- ENDP ; |vp9_short_idct16x16_1_add_neon|
+ ENDP ; |vp9_idct16x16_1_add_neon|
END
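
For context on the *_1_add kernels: they implement the DC-only inverse transform. The single coefficient loaded by "ldrsh r0, [r0]" is passed twice through dct_const_round_shift(x * cospi_16_64), rounded down to pixel scale, and added to every destination pixel. A C restatement of what this kernel computes, mirroring the generic C version (ROUND_POWER_OF_TWO, clip_pixel, and the cospi constants come from the common headers):

    static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                       int dest_stride) {
      int i, j, a1;
      int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
      out = dct_const_round_shift(out * cospi_16_64);
      a1 = ROUND_POWER_OF_TWO(out, 6);  /* same final rounding as the 2-D path */
      for (j = 0; j < 16; ++j) {
        for (i = 0; i < 16; ++i)
          dest[i] = clip_pixel(dest[i] + a1);  /* add the DC term, clip */
        dest += dest_stride;
      }
    }
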
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index 7464e80..a13c0d0 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -8,12 +8,10 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct16x16_add_neon_pass1|
- EXPORT |vp9_short_idct16x16_add_neon_pass2|
- EXPORT |vp9_short_idct10_16x16_add_neon_pass1|
- EXPORT |vp9_short_idct10_16x16_add_neon_pass2|
- EXPORT |save_neon_registers|
- EXPORT |restore_neon_registers|
+ EXPORT |vp9_idct16x16_256_add_neon_pass1|
+ EXPORT |vp9_idct16x16_256_add_neon_pass2|
+ EXPORT |vp9_idct16x16_10_add_neon_pass1|
+ EXPORT |vp9_idct16x16_10_add_neon_pass2|
ARM
REQUIRE8
PRESERVE8
@@ -38,7 +36,7 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
; int16_t *output, int output_stride)
;
; r0 int16_t input
@@ -48,7 +46,7 @@
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass1| PROC
+|vp9_idct16x16_256_add_neon_pass1| PROC
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -275,9 +273,9 @@
vst1.64 {d31}, [r1], r2
bx lr
- ENDP ; |vp9_short_idct16x16_add_neon_pass1|
+ ENDP ; |vp9_idct16x16_256_add_neon_pass1|
-;void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
; int16_t *output,
; int16_t *pass1Output,
; int16_t skip_adding,
@@ -294,7 +292,7 @@
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass2| PROC
+|vp9_idct16x16_256_add_neon_pass2| PROC
push {r3-r9}
; TODO(hkuang): Find a better way to load the elements.
@@ -786,9 +784,9 @@ skip_adding_dest
end_idct16x16_pass2
pop {r3-r9}
bx lr
- ENDP ; |vp9_short_idct16x16_add_neon_pass2|
+ ENDP ; |vp9_idct16x16_256_add_neon_pass2|
-;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
; int16_t *output, int output_stride)
;
; r0 int16_t input
@@ -798,7 +796,7 @@ end_idct16x16_pass2
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct10_16x16_add_neon_pass1| PROC
+|vp9_idct16x16_10_add_neon_pass1| PROC
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -907,9 +905,9 @@ end_idct16x16_pass2
vst1.64 {d31}, [r1], r2
bx lr
- ENDP ; |vp9_short_idct10_16x16_add_neon_pass1|
+ ENDP ; |vp9_idct16x16_10_add_neon_pass1|
-;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
; int16_t *output,
; int16_t *pass1Output,
; int16_t skip_adding,
@@ -926,7 +924,7 @@ end_idct16x16_pass2
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct10_16x16_add_neon_pass2| PROC
+|vp9_idct16x16_10_add_neon_pass2| PROC
push {r3-r9}
; TODO(hkuang): Find a better way to load the elements.
@@ -1177,15 +1175,5 @@ end_idct16x16_pass2
end_idct10_16x16_pass2
pop {r3-r9}
bx lr
- ENDP ; |vp9_short_idct10_16x16_add_neon_pass2|
-;void |save_neon_registers|()
-|save_neon_registers| PROC
- vpush {d8-d15}
- bx lr
- ENDP ; |save_registers|
-;void |restore_neon_registers|()
-|restore_neon_registers| PROC
- vpop {d8-d15}
- bx lr
- ENDP ; |restore_registers|
+ ENDP ; |vp9_idct16x16_10_add_neon_pass2|
END
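
Two details of the renamed pass2 interface are worth spelling out. First, the new suffixes count coefficients: _256 handles a full 16x16 block, while _10 is the shortcut for blocks where at most the first ten coefficients are nonzero. Second, skip_adding selects the output mode: the C wrapper at the top of this diff passes 0 on row passes (results stored to the intermediate buffer) and 1 on column passes (results rounded, added to dest, and clipped). The two call shapes, as the wrapper uses them:

    /* Row pass: write to row_idct_output; dest/dest_stride are passed
     * through but presumably unused in this mode. */
    vp9_idct16x16_256_add_neon_pass2(input + 1, row_idct_output,
                                     pass1_output, 0, dest, dest_stride);
    /* Column pass: combine-add the final result into dest. */
    vp9_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
                                     pass1_output, 1, dest, dest_stride);
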
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index 5c097cc..f00d027 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -43,8 +43,7 @@ cospi_30_64 EQU 1606
cospi_31_64 EQU 804
- EXPORT |idct32_transpose_and_transform|
- EXPORT |idct32_combine_add|
+ EXPORT |vp9_idct32x32_1024_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -100,6 +99,142 @@ cospi_31_64 EQU 804
vst1.16 {$reg2}, [r1]
MEND
; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]
+ vst1.16 {d11}, [r9]
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]!
+ vst1.16 {d11}, [r9]!
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]
+ vst1.16 {d4}, [r7]
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]!
+ vst1.16 {d4}, [r7]!
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
MACRO
@@ -110,12 +245,12 @@ cospi_31_64 EQU 804
; additions/subtractions before the multiplies.
; generate the constants
; generate scalar constants
- mov r3, #$first_constant & 0xFF00
- add r3, #$first_constant & 0x00FF
+ mov r8, #$first_constant & 0xFF00
mov r12, #$second_constant & 0xFF00
+ add r8, #$first_constant & 0x00FF
add r12, #$second_constant & 0x00FF
; generate vector constants
- vdup.16 d30, r3
+ vdup.16 d30, r8
vdup.16 d31, r12
; (used) two for inputs (regA-regD), one for constants (q15)
; do some multiplications (ordered for maximum latency hiding)
@@ -153,15 +288,22 @@ cospi_31_64 EQU 804
MEND
; --------------------------------------------------------------------------
-;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input);
+;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
-; r0 int16_t *transpose_buffer
-; r1 int16_t *output
-; r2 int16_t *input)
-; TODO(cd): have more logical parameter ordering but this issue will disappear
-; when functions are combined.
+; r0 int16_t *input,
+; r1 uint8_t *dest,
+; r2 int dest_stride)
+; loop counters
+; r4 bands loop counter
+; r5 pass loop counter
+; r8 transpose loop counter
+; combine-add pointers
+; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
+; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
+; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
+; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
-|idct32_transpose_and_transform| PROC
+|vp9_idct32x32_1024_add_neon| PROC
; This function does one pass of idct32x32 transform.
;
; This is done by transposing the input and then doing a 1d transform on
@@ -171,43 +313,73 @@ cospi_31_64 EQU 804
; The 1d transform is done by looping over bands of eight columns (the
; idct32_bands loop). For each band, the transform input transposition
; is done on demand, one band of four 8x8 matrices at a time. The four
- ; matrices are trsnposed by pairs (the idct32_transpose_pair loop).
- push {r4}
- mov r4, #0 ; initialize bands loop counter
+ ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+ push {r4-r11}
+ vpush {d8-d15}
+ ; stack operation
+ ; internal buffer used to transpose 8 lines into before transforming them
+ ; int16_t transpose_buffer[32 * 8];
+ ; at sp + [4096, 4607]
+ ; results of the first pass (transpose and transform rows)
+ ; int16_t pass1[32 * 32];
+ ; at sp + [0, 2047]
+ ; results of the second pass (transpose and transform columns)
+ ; int16_t pass2[32 * 32];
+ ; at sp + [2048, 4095]
+ sub sp, sp, #512+2048+2048
+
+ ; r6 = dest + 31 * dest_stride
+ ; r7 = dest + 0 * dest_stride
+ ; r9 = dest + 15 * dest_stride
+ ; r10 = dest + 16 * dest_stride
+ rsb r6, r2, r2, lsl #5
+ rsb r9, r2, r2, lsl #4
+ add r10, r1, r2, lsl #4
+ mov r7, r1
+ add r6, r6, r1
+ add r9, r9, r1
+ ; r11 = -dest_stride
+ neg r11, r2
+ ; r3 = input
+ mov r3, r0
+ ; parameters for first pass
+ ; r0 = transpose_buffer[32 * 8]
+ add r0, sp, #4096
+ ; r1 = pass1[32 * 32]
+ mov r1, sp
+
+ mov r5, #0 ; initialize pass loop counter
+idct32_pass_loop
+ mov r4, #4 ; initialize bands loop counter
idct32_bands_loop
- ; TODO(cd) get rid of these push/pop by properly adjusting register
- ; content at end of loop
- push {r0}
- push {r1}
- push {r2}
- mov r3, #0 ; initialize transpose loop counter
+ mov r8, #2 ; initialize transpose loop counter
idct32_transpose_pair_loop
; Load two horizontally consecutive 8x8 16bit data matrices. The first one
; into q0-q7 and the second one into q8-q15. There is a stride of 64,
; adjusted to 32 because of the two post-increments.
- vld1.s16 {q8}, [r2]!
- vld1.s16 {q0}, [r2]!
- add r2, #32
- vld1.s16 {q9}, [r2]!
- vld1.s16 {q1}, [r2]!
- add r2, #32
- vld1.s16 {q10}, [r2]!
- vld1.s16 {q2}, [r2]!
- add r2, #32
- vld1.s16 {q11}, [r2]!
- vld1.s16 {q3}, [r2]!
- add r2, #32
- vld1.s16 {q12}, [r2]!
- vld1.s16 {q4}, [r2]!
- add r2, #32
- vld1.s16 {q13}, [r2]!
- vld1.s16 {q5}, [r2]!
- add r2, #32
- vld1.s16 {q14}, [r2]!
- vld1.s16 {q6}, [r2]!
- add r2, #32
- vld1.s16 {q15}, [r2]!
- vld1.s16 {q7}, [r2]!
+ vld1.s16 {q8}, [r3]!
+ vld1.s16 {q0}, [r3]!
+ add r3, #32
+ vld1.s16 {q9}, [r3]!
+ vld1.s16 {q1}, [r3]!
+ add r3, #32
+ vld1.s16 {q10}, [r3]!
+ vld1.s16 {q2}, [r3]!
+ add r3, #32
+ vld1.s16 {q11}, [r3]!
+ vld1.s16 {q3}, [r3]!
+ add r3, #32
+ vld1.s16 {q12}, [r3]!
+ vld1.s16 {q4}, [r3]!
+ add r3, #32
+ vld1.s16 {q13}, [r3]!
+ vld1.s16 {q5}, [r3]!
+ add r3, #32
+ vld1.s16 {q14}, [r3]!
+ vld1.s16 {q6}, [r3]!
+ add r3, #32
+ vld1.s16 {q15}, [r3]!
+ vld1.s16 {q7}, [r3]!
; Transpose the two 8x8 16bit data matrices.
vswp d17, d24
@@ -255,11 +427,13 @@ idct32_transpose_pair_loop
vst1.16 {q7}, [r0]!
; increment pointers by adjusted stride (not necessary for r0/out)
- sub r2, r2, #8*32*2-32-16*2
+ ; go back by 7*32 for the seven lines moved fully by read and add
+ ; go back by 32 for the eighth line, which was only read
+ ; advance by 16*2 to go to the next pair
+ sub r3, r3, #7*32*2 + 32 - 16*2
; transpose pair loop processing
- add r3, r3, #1
- cmp r3, #1
- BLE idct32_transpose_pair_loop
+ subs r8, r8, #1
+ bne idct32_transpose_pair_loop
; restore r0/input to its original value
sub r0, r0, #32*8*2
@@ -815,21 +989,26 @@ idct32_transpose_pair_loop
vadd.s16 q9, q5, q0
vsub.s16 q6, q5, q0
vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 17, 17, 16, q7, q6
- STORE_IN_OUTPUT 16, 15, 14, q9, q8
+
+ cmp r5, #0
+ bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+ STORE_IN_OUTPUT 17, 16, 17, q6, q7
+ STORE_IN_OUTPUT 17, 14, 15, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
;output[30 * 32] = step1b[1][i] - step1b[30][i];
;output[31 * 32] = step1b[0][i] - step1b[31][i];
- LOAD_FROM_OUTPUT 14, 30, 31, q0, q1
+ LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 31, 31, 30, q7, q6
- STORE_IN_OUTPUT 30, 0, 1, q4, q5
+ STORE_IN_OUTPUT 31, 30, 31, q6, q7
+ STORE_IN_OUTPUT 31, 0, 1, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[2] = step1b[2][i] + step1b[13][i];
@@ -848,25 +1027,25 @@ idct32_transpose_pair_loop
;output[18 * 32] = step1b[13][i] - step1b[18][i];
;output[19 * 32] = step1b[12][i] - step1b[19][i];
LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
- vadd.s16 q6, q4, q1
- vadd.s16 q7, q5, q0
- vsub.s16 q8, q5, q0
- vsub.s16 q9, q4, q1
- STORE_IN_OUTPUT 19, 19, 18, q9, q8
- STORE_IN_OUTPUT 18, 13, 12, q7, q6
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 19, 18, 19, q6, q7
+ STORE_IN_OUTPUT 19, 12, 13, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
;output[28 * 32] = step1b[3][i] - step1b[28][i];
;output[29 * 32] = step1b[2][i] - step1b[29][i];
- LOAD_FROM_OUTPUT 12, 28, 29, q0, q1
+ LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 29, 29, 28, q7, q6
- STORE_IN_OUTPUT 28, 2, 3, q4, q5
+ STORE_IN_OUTPUT 29, 28, 29, q6, q7
+ STORE_IN_OUTPUT 29, 2, 3, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[4] = step1b[4][i] + step1b[11][i];
@@ -885,25 +1064,25 @@ idct32_transpose_pair_loop
;output[20 * 32] = step1b[11][i] - step1b[20][i];
;output[21 * 32] = step1b[10][i] - step1b[21][i];
LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
- vadd.s16 q6, q4, q1
- vadd.s16 q7, q5, q0
- vsub.s16 q8, q5, q0
- vsub.s16 q9, q4, q1
- STORE_IN_OUTPUT 21, 21, 20, q9, q8
- STORE_IN_OUTPUT 20, 11, 10, q7, q6
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 21, 20, 21, q6, q7
+ STORE_IN_OUTPUT 21, 10, 11, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
;output[26 * 32] = step1b[5][i] - step1b[26][i];
;output[27 * 32] = step1b[4][i] - step1b[27][i];
- LOAD_FROM_OUTPUT 10, 26, 27, q0, q1
+ LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 27, 27, 26, q7, q6
- STORE_IN_OUTPUT 26, 4, 5, q4, q5
+ STORE_IN_OUTPUT 27, 26, 27, q6, q7
+ STORE_IN_OUTPUT 27, 4, 5, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[6] = step1b[6][i] + step1b[9][i];
@@ -922,92 +1101,199 @@ idct32_transpose_pair_loop
;output[22 * 32] = step1b[9][i] - step1b[22][i];
;output[23 * 32] = step1b[8][i] - step1b[23][i];
LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
- vadd.s16 q6, q4, q1
- vadd.s16 q7, q5, q0
- vsub.s16 q8, q5, q0
- vsub.s16 q9, q4, q1
- STORE_IN_OUTPUT 23, 23, 22, q9, q8
- STORE_IN_OUTPUT 22, 9, 8, q7, q6
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 23, 22, 23, q6, q7
+ STORE_IN_OUTPUT 23, 8, 9, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
;output[24 * 32] = step1b[7][i] - step1b[24][i];
;output[25 * 32] = step1b[6][i] - step1b[25][i];
- LOAD_FROM_OUTPUT 8, 24, 25, q0, q1
+ LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 25, 25, 24, q7, q6
- STORE_IN_OUTPUT 24, 6, 7, q4, q5
- ; --------------------------------------------------------------------------
+ STORE_IN_OUTPUT 25, 24, 25, q6, q7
+ STORE_IN_OUTPUT 25, 6, 7, q4, q5
- ; TODO(cd) get rid of these push/pop by properly adjusting register
- ; content at end of loop
- pop {r2}
- pop {r1}
- pop {r0}
- add r1, r1, #8*2
- add r2, r2, #8*32*2
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #7*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
; bands loop processing
- add r4, r4, #1
- cmp r4, #3
- BLE idct32_bands_loop
+ subs r4, r4, #1
+ bne idct32_bands_loop
- pop {r4}
- bx lr
- ENDP ; |idct32_transpose_and_transform|
+ ; parameters for second pass
+ ; the input of pass2 is the result of pass1. we have to remove the offset
+ ; of 32 columns induced by the above idct32_bands_loop
+ sub r3, r1, #32*2
+ ; r1 = pass2[32 * 32]
+ add r1, sp, #2048
-;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
-;
-; r0 uint8_t *dest
-; r1 int16_t *out
-; r2 int dest_stride)
-
-|idct32_combine_add| PROC
-
- mov r12, r0 ; dest pointer used for stores
- sub r2, r2, #32 ; adjust the stride (remove the post-increments)
- mov r3, #0 ; initialize loop counter
-
-idct32_combine_add_loop
- ; load out[j * 32 + 0-31]
- vld1.s16 {q12}, [r1]!
- vld1.s16 {q13}, [r1]!
- vld1.s16 {q14}, [r1]!
- vld1.s16 {q15}, [r1]!
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {q6}, [r0]!
- vld1.s16 {q7}, [r0]!
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q12, q12, #6
- vrshr.s16 q13, q13, #6
- vrshr.s16 q14, q14, #6
- vrshr.s16 q15, q15, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q12, q12, d12
- vaddw.u8 q13, q13, d13
- vaddw.u8 q14, q14, d14
- vaddw.u8 q15, q15, d15
- ; clip pixel
- vqmovun.s16 d12, q12
- vqmovun.s16 d13, q13
- vqmovun.s16 d14, q14
- vqmovun.s16 d15, q15
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {q6}, [r12]!
- vst1.16 {q7}, [r12]!
- ; increment pointers by adjusted stride (not necessary for r1/out)
- add r0, r0, r2
- add r12, r12, r2
- ; loop processing
- add r3, r3, #1
- cmp r3, #31
- BLE idct32_combine_add_loop
+ ; pass loop processing
+ add r5, r5, #1
+ B idct32_pass_loop
- bx lr
- ENDP ; |idct32_transpose|
+idct32_bands_end_2nd_pass
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; restore pointers to their initial indices for next band pass by
+ ; removing/adding dest_stride * 8. The actual increment by eight
+ ; is taken care of within the _LAST macros.
+ add r6, r6, r2, lsl #3
+ add r9, r9, r2, lsl #3
+ sub r7, r7, r2, lsl #3
+ sub r10, r10, r2, lsl #3
+
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #25*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
+ ; bands loop processing
+ subs r4, r4, #1
+ bne idct32_bands_loop
+
+ ; stack operation
+ add sp, sp, #512+2048+2048
+ vpop {d8-d15}
+ pop {r4-r11}
+ bx lr
+ ENDP ; |vp9_idct32x32_1024_add_neon|
END
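
All four STORE_COMBINE_* macros above perform the same reconstruction step and differ only in which registers and dest pointers they touch and whether they post-increment for the next band. In scalar terms, the vrshr/vaddw/vqmovun sequence amounts to the following sketch (ROUND_POWER_OF_TWO and clip_pixel as in vp9_common.h):

    /* One row of combine-add: round the transform output (it carries a
     * scale factor of 64, hence the shift by 6), add the prediction,
     * then saturate to [0, 255] as vqmovun.s16 does. */
    static void combine_add_row_sketch(uint8_t *dest, const int16_t *out,
                                       int width) {
      int i;
      for (i = 0; i < width; ++i)
        dest[i] = clip_pixel(ROUND_POWER_OF_TWO(out[i], 6) + dest[i]);
    }
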
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
index 869ee5f..0d4a721 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct4x4_1_add_neon|
+ EXPORT |vp9_idct4x4_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct4x4_1_add_neon| PROC
+|vp9_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -63,6 +63,6 @@
vst1.32 {d7[1]}, [r12]
bx lr
- ENDP ; |vp9_short_idct4x4_1_add_neon|
+ ENDP ; |vp9_idct4x4_1_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
index 640fb93..00283fc 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct4x4_add_neon|
+ EXPORT |vp9_idct4x4_16_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -16,13 +16,13 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct4x4_add_neon| PROC
+|vp9_idct4x4_16_add_neon| PROC
; The 2D transform is done with two passes which are actually pretty
; similar. We first transform the rows. This is done by transposing
@@ -185,6 +185,6 @@
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
- ENDP ; |vp9_short_idct4x4_add_neon|
+ ENDP ; |vp9_idct4x4_16_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
index 923804f..421d202 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct8x8_1_add_neon|
+ EXPORT |vp9_idct8x8_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_1_add_neon| PROC
+|vp9_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -83,6 +83,6 @@
vst1.64 {d31}, [r12], r2
bx lr
- ENDP ; |vp9_short_idct8x8_1_add_neon|
+ ENDP ; |vp9_idct8x8_1_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index a744f59..5476400 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -8,8 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct8x8_add_neon|
- EXPORT |vp9_short_idct10_8x8_add_neon|
+ EXPORT |vp9_idct8x8_64_add_neon|
+ EXPORT |vp9_idct8x8_10_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -198,13 +198,13 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_add_neon| PROC
+|vp9_idct8x8_64_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
@@ -308,15 +308,15 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
- ENDP ; |vp9_short_idct8x8_add_neon|
+ ENDP ; |vp9_idct8x8_64_add_neon|
-;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct10_8x8_add_neon| PROC
+|vp9_idct8x8_10_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
- ENDP ; |vp9_short_idct10_8x8_add_neon|
+ ENDP ; |vp9_idct8x8_10_add_neon|
END
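
The 8x8 renames follow the same convention: _64 is the full block, _10 the shortcut for nearly empty blocks. The decoder's choice between them (presumably in vp9_idct_blk.c, which appears in the diffstat) looks roughly like the sketch below; the exact threshold is inferred from the _10 suffix, not quoted from source:

    /* Hypothetical eob-based dispatch; the threshold is an assumption. */
    if (eob <= 10)
      vp9_idct8x8_10_add(qcoeff, dst, stride);  /* only low-frequency coeffs */
    else
      vp9_idct8x8_64_add(qcoeff, dst, stride);  /* full 8x8 block */
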
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
index 963ef35..2f326e2 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_iht4x4_add_neon|
+ EXPORT |vp9_iht4x4_16_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -139,7 +139,7 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t input
@@ -147,7 +147,7 @@
; r2 int dest_stride
; r3 int tx_type)
; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht4x4_add_neon| PROC
+|vp9_iht4x4_16_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
@@ -175,7 +175,7 @@ iadst_idct
; then transform columns
IADST4x4_1D
- b end_vp9_short_iht4x4_add_neon
+ b end_vp9_iht4x4_16_add_neon
idct_iadst
; generate constants
@@ -191,7 +191,7 @@ idct_iadst
; then transform columns
IDCT4x4_1D
- b end_vp9_short_iht4x4_add_neon
+ b end_vp9_iht4x4_16_add_neon
iadst_iadst
; generate constants
@@ -206,7 +206,7 @@ iadst_iadst
; then transform columns
IADST4x4_1D
-end_vp9_short_iht4x4_add_neon
+end_vp9_iht4x4_16_add_neon
; ROUND_POWER_OF_TWO(temp_out[j], 4)
vrshr.s16 q8, q8, #4
vrshr.s16 q9, q9, #4
@@ -232,6 +232,6 @@ end_vp9_short_iht4x4_add_neon
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
- ENDP ; |vp9_short_iht4x4_add_neon|
+ ENDP ; |vp9_iht4x4_16_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
index bab9cb4..93d3af3 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_iht8x8_add_neon|
+ EXPORT |vp9_iht8x8_64_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -559,7 +559,7 @@
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t input
@@ -567,7 +567,7 @@
; r2 int dest_stride
; r3 int tx_type)
; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht8x8_add_neon| PROC
+|vp9_iht8x8_64_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
@@ -602,7 +602,7 @@ iadst_idct
; then transform columns
IADST8X8_1D
- b end_vp9_short_iht8x8_add_neon
+ b end_vp9_iht8x8_64_add_neon
idct_iadst
; generate IADST constants
@@ -620,7 +620,7 @@ idct_iadst
; then transform columns
IDCT8x8_1D
- b end_vp9_short_iht8x8_add_neon
+ b end_vp9_iht8x8_64_add_neon
iadst_iadst
; generate IADST constants
@@ -635,7 +635,7 @@ iadst_iadst
; then transform columns
IADST8X8_1D
-end_vp9_short_iht8x8_add_neon
+end_vp9_iht8x8_64_add_neon
pop {r0-r10}
; ROUND_POWER_OF_TWO(temp_out[j], 5)
@@ -691,6 +691,6 @@ end_vp9_short_iht8x8_add_neon
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
bx lr
- ENDP ; |vp9_short_iht8x8_add_neon|
+ ENDP ; |vp9_iht8x8_64_add_neon|
END
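
Both IHT kernels branch on tx_type right after loading the input, which is what the iadst_idct, idct_iadst, and iadst_iadst labels above correspond to. A sketch of the mapping (the enum names are the usual VP9 ones and are an assumption here; tx_type 0, DCT_DCT, never reaches these functions and is served by the plain IDCT kernels):

    switch (tx_type) {
      case 1: /* ADST_DCT   -> iadst_idct:  IDCT one way, IADST the other */
      case 2: /* DCT_ADST   -> idct_iadst:  the opposite pairing          */
      case 3: /* ADST_ADST  -> iadst_iadst: IADST in both directions      */
        break;
      default: /* DCT_DCT is handled by vp9_idct8x8_64_add_neon et al.    */
        break;
    }
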
diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c
index f144721..536febb 100644
--- a/libvpx/vp9/common/generic/vp9_systemdependent.c
+++ b/libvpx/vp9/common/generic/vp9_systemdependent.c
@@ -10,7 +10,7 @@
#include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
void vp9_machine_specific_config(VP9_COMMON *cm) {
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h
new file mode 100644
index 0000000..644264f
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_
+#define VP9_COMMON_VP9_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+extern uint8_t *vp9_ff_cropTbl;
+
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \
+ \
+ int32_t tmp, out; \
+ int dct_cost_rounding = DCT_CONST_ROUNDING; \
+ int in = input; \
+ \
+ __asm__ __volatile__ ( \
+ /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \
+ "mtlo %[dct_cost_rounding], $ac1 \n\t"\
+ "mthi $zero, $ac1 \n\t"\
+ "madd $ac1, %[in], %[cospi_16_64] \n\t"\
+ "extp %[tmp], $ac1, 31 \n\t"\
+ \
+ /* out = dct_const_round_shift(out * cospi_16_64); */ \
+ "mtlo %[dct_cost_rounding], $ac2 \n\t"\
+ "mthi $zero, $ac2 \n\t"\
+ "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\
+ "extp %[out], $ac2, 31 \n\t"\
+ \
+ : [tmp] "=&r" (tmp), [out] "=r" (out) \
+ : [in] "r" (in), \
+ [dct_cost_rounding] "r" (dct_cost_rounding), \
+ [cospi_16_64] "r" (cospi_16_64) \
+ ); \
+ out; })
+
+static INLINE void vp9_prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+/* prefetch data for store */
+static INLINE void vp9_prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__ (
+ "pref 1, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__ (
+ "pref 4, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+/* prefetch data for store */
+static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__ (
+ "pref 5, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h);
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
new file mode 100644
index 0000000..91d62bc
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
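+ /* Pack the two non-zero taps of the bilinear filter (filter_y[3] and
+    filter_y[4]) into one word so each dpa.w.ph applies both taps at once. */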
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
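+ /* Only unscaled vertical filtering (y_step_q4 == 16) takes the DSPR2 fast
+    path; any other step size falls back to the C implementation. */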
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_avg_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000..148b20f
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3;
+ uint32_t tn1, tn2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tp4], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tp4], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tp3], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp4], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
+
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
+
+ /* store bytes */
+ "sb %[tp3], 3(%[dst]) \n\t"
+ "sb %[tp4], 5(%[dst]) \n\t"
+ "sb %[tp1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
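+ /* As in the vertical case, only the unscaled path (x_step_q4 == 16) is
+    accelerated; other step sizes fall back to vp9_convolve8_avg_horiz_c. */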
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 8:
+ convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 16:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 1);
+ break;
+ case 32:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ default:
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
new file mode 100644
index 0000000..92644f2
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
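+/* The _transposed variants below filter horizontally but write each output
+   pixel down a column: dst advances by dst_stride per output pixel and by
+   one byte per input row. */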
+static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [dst_ptr] "+r" (dst_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[Temp1], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
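
The scalar fallback above spells out the bilinear semantics used throughout this file: only taps 3 and 4 of the 8-tap filter array contribute, and the output is written transposed (dst advances by one byte per source row and by dst_stride per output pixel) so the following vertical pass can run over contiguous rows. A self-contained sketch of the per-pixel arithmetic, with clip_pixel(), ROUND_POWER_OF_TWO and FILTER_BITS re-declared here purely for illustration (in libvpx they come from the common headers):

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel_sketch(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : (uint8_t)val;
}

/* One bilinear output pixel: a 2-tap filter over src[x] and src[x + 1]. */
static uint8_t bilinear_pixel(const uint8_t *src, int x,
                              const int16_t *filter) {
  const int sum = src[x] * filter[3] + src[x + 1] * filter[4];
  return clip_pixel_sketch(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
}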
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h) {
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h,
+ (w/16));
+ break;
+ case 64:
+ vp9_prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride,
+ dst, dst_stride,
+ filter, w, h);
+ break;
+ }
+}
+#endif
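
A note on the wrdsp/extp pairing used by every kernel in these files: wrdsp sets the DSPControl extract position to 38, and each later extp ..., 31 then reads 32 bits of the accumulator ending at bit 7, which amounts to an arithmetic right shift by FILTER_BITS. Combined with the rounding constant 64 seeded into the accumulator by mtlo/mthi and the clamp performed by indexing vp9_ff_cropTbl through lbux, each accumulator round-trip computes clip_pixel(ROUND_POWER_OF_TWO(sum, 7)). A scalar model of that sequence, based on my reading of the MIPS DSP ASE semantics:

#include <stdint.h>

/* Model of: mtlo 64; dpa.w.ph acc, ...; extp t, acc, 31; lbux st, t(cm) */
static uint8_t acc_round_clamp(int64_t sum_of_products) {
  const int64_t acc = sum_of_products + 64;  /* mtlo seeds the rounding bias */
  const int t = (int)(acc >> 7);             /* extp: bits 38..7 with pos=38 */
  return (t < 0) ? 0 : (t > 255) ? 255 : (uint8_t)t;  /* cropTbl clamp */
}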
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
new file mode 100644
index 0000000..1debdb4
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
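
The filter45 word computed at the top of this function reinterprets the two int16_t taps filter_x0[3..4] as one 32-bit operand, so that a single dpa.w.ph accumulates both tap products for a pixel at once; preceu.ph.qbr/qbl supply the matching packed pixel halfwords. A scalar model of one such packed multiply-accumulate, assuming the little-endian layout these DSPr2 builds use (tap 3 in the low halfword):

#include <stdint.h>

/* One dpa.w.ph: two packed pixels times two packed taps, accumulated. */
static int64_t dpa_w_ph_model(int64_t acc, uint16_t px_lo, uint16_t px_hi,
                              uint32_t filter45) {
  const int16_t tap3 = (int16_t)(filter45 & 0xffff);  /* filter_x0[3] */
  const int16_t tap4 = (int16_t)(filter45 >> 16);     /* filter_x0[4] */
  return acc + (int64_t)px_lo * tap3 + (int64_t)px_hi * tap4;
}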
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
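
The balign instructions above save a second round of unaligned loads for the odd pixels: the word starting at src + 1 is spliced out of the two aligned words already held in registers. Assuming the DSP ASE definition of balign rt, rs, 3 on a little-endian target, rt = (rt << 24) | (rs >> 8), a C model:

#include <stdint.h>

/* balign hi, lo, 3: the unaligned word one byte into the window formed
 * by two consecutive little-endian words (lo at addr, hi at addr + 4). */
static uint32_t balign3_model(uint32_t hi_word, uint32_t lo_word) {
  return (hi_word << 24) | (lo_word >> 8);
}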
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
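
The 16-wide kernel above runs in two phases within one asm block: the eight even outputs come from words loaded at src + 0/4/8/12, the eight odd outputs from words reloaded at src + 1/5/9/13, and each store is interleaved with the next pixel's multiply-accumulate so the $ac1..$ac3 accumulators stay busy. The net output schedule, modelled in plain C (bilinear_pixel is the hypothetical helper sketched earlier):

/* Per 16-pixel chunk: all even offsets first, then the odd ones. */
static void bi_horiz_16_model(const uint8_t *src, uint8_t *dst,
                              const int16_t *filter) {
  int i;
  for (i = 0; i < 16; i += 2)  /* "even 1".."even 8" in the asm comments */
    dst[i] = bilinear_pixel(src, i, filter);
  for (i = 1; i < 16; i += 2)  /* "odd 1".."odd 8" in the asm comments */
    dst[i] = bilinear_pixel(src, i, filter);
}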
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ vp9_prefetch_load((const uint8_t *)filter_x);
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
new file mode 100644
index 0000000..bf01f11
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
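
In this vertical kernel each ulw fetches four horizontally adjacent pixels from one row; precrq.ph.w and append then re-pair the unpacked halfwords so each packed operand holds the two vertical neighbours of a single output pixel, letting one dpa.w.ph apply both taps. Scalar equivalent of one output pixel, reusing the rounding/clamp model sketched earlier:

/* One vertical bilinear pixel: taps 3 and 4 over two consecutive rows. */
static uint8_t bi_vert_pixel(const uint8_t *src, int src_stride, int x,
                             const int16_t *filter_y) {
  const int sum = src[x] * filter_y[3] + src[x + src_stride] * filter_y[4];
  const int t = (sum + 64) >> 7;  /* mtlo bias + extp shift (FILTER_BITS) */
  return (t < 0) ? 0 : (t > 255) ? 255 : (uint8_t)t;
}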
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+      case 4:
+      case 8:
+      case 16:
+      case 32:
+ convolve_bi_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+      case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
new file mode 100644
index 0000000..ab18490
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
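
Functionally the block above is a full 8-tap vertical filter: the four packed vectorNb words cover taps 0..7 across eight source rows, starting three rows above the output because of the src -= 3 * src_stride adjustment, and the filtered result is blended with the byte already in dst by addqh_r.w, a rounded halving add. A scalar model of one output pixel (src here already points at the first tap row):

/* 8-tap vertical filter, averaged with the pixel already in dst. */
static uint8_t avg_vert_pixel(const uint8_t *src, int src_stride, int x,
                              const int16_t *filter_y, uint8_t dst_old) {
  int k, sum = 64;                        /* rounding bias, as in mtlo */
  for (k = 0; k < 8; ++k)
    sum += src[x + k * src_stride] * filter_y[k];
  sum >>= 7;                              /* extp shift (FILTER_BITS) */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;               /* vp9_ff_cropTbl clamp */
  return (uint8_t)((sum + dst_old + 1) >> 1);  /* addqh_r.w average */
}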
+
+static void convolve_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_avg(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_avg_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+      /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_avg_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_avg_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
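
The two packed-word tests at the top of this dispatcher read the int16_t filter as int32_t words, pairing taps (0,1), (2,3), (4,5), (6,7). On a little-endian target, word 1 == 0x800000 means taps 2 and 3 are {0, 128}, i.e. the unit "copy" filter, so a plain destination average suffices; word 0 == 0 means taps 0 and 1 are both zero, which routes to the short 2-tap path. The same predicates written tap-by-tap, as a hedged sketch:

/* Tap-level equivalents of the packed-word tests (little-endian layout). */
static int is_copy_filter(const int16_t *filter) {
  return filter[2] == 0 && filter[3] == 128;  /* word 1 == 0x800000 */
}

static int is_two_tap_filter(const int16_t *filter) {
  return filter[0] == 0 && filter[1] == 0;    /* word 0 == 0 */
}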
+
+void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+
+ assert(w <= 64);
+ assert(h <= 64);
+
+ if (intermediate_height < h)
+ intermediate_height = h;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)
+ return vp9_convolve8_avg_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+ vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
+ temp, 64,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, intermediate_height);
+
+ vp9_convolve8_avg_vert(temp + 64 * 3, 64,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+}
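
The function above is the standard separable two-pass scheme: a horizontal pass into a 64-byte-stride scratch buffer, then a vertical averaging pass over it. The horizontal pass starts three rows above the output (src - src_stride * 3) and produces intermediate_height rows so the 8-tap vertical filter always has its three-above/four-below support, and the vertical pass skips the three lead-in rows again via temp + 64 * 3. The row budget in isolation:

/* Rows the scratch buffer must hold ahead of an 8-tap vertical pass. */
static int intermediate_rows(int h, int y_step_q4) {
  int rows = ((h * y_step_q4) >> 4) + 7;  /* +7 extra rows for 8 taps */
  return (rows < h) ? h : rows;           /* same lower bound as above */
}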
+
+void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+ uint32_t tp1, tp2, tn1;
+ uint32_t tp3, tp4, tn2;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ /* 1 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+
+ : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+ [tp2] "=&r" (tp2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 8:
+ /* 2 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 16:
+ /* 4 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 32:
+ /* 8 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_load(src + src_stride + 64);
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 32(%[dst]) \n\t"
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 36(%[src]) \n\t"
+ "ulw %[tp4], 36(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 40(%[src]) \n\t"
+ "ulw %[tp2], 40(%[dst]) \n\t"
+ "sw %[tn1], 32(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 36(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 44(%[src]) \n\t"
+ "ulw %[tp4], 44(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 48(%[src]) \n\t"
+ "ulw %[tp2], 48(%[dst]) \n\t"
+ "sw %[tn1], 40(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 44(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 52(%[src]) \n\t"
+ "ulw %[tp4], 52(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 56(%[src]) \n\t"
+ "ulw %[tp2], 56(%[dst]) \n\t"
+ "sw %[tn1], 48(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 52(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 60(%[src]) \n\t"
+ "ulw %[tp4], 60(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 56(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ default:
+ for (y = h; y > 0; --y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = (dst[x] + src[x] + 1) >> 1;
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
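
Each adduh_r.qb above is the SIMD form of the scalar tail in the default case: it averages four packed bytes at once with round-to-nearest, (a + b + 1) >> 1 per lane. A C model of one such instruction:

#include <stdint.h>

/* Per-byte rounded halving add, as adduh_r.qb performs on a 32-bit word. */
static uint32_t adduh_r_qb_model(uint32_t a, uint32_t b) {
  uint32_t result = 0;
  int lane;
  for (lane = 0; lane < 4; ++lane) {
    const uint32_t av = (a >> (8 * lane)) & 0xff;
    const uint32_t bv = (b >> (8 * lane)) & 0xff;
    result |= ((av + bv + 1) >> 1) << (8 * lane);
  }
  return result;
}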
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
new file mode 100644
index 0000000..69da1cf
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
@@ -0,0 +1,1038 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp and average */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
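+
+/* Editor's note: a hypothetical scalar model of the function above, not part
+ * of the original patch. It assumes clip_pixel() from vp9_common.h and that
+ * the caller has already stepped src back by 3 taps, as
+ * vp9_convolve8_avg_horiz_dspr2 below does: */
+#if 0
+static void convolve_avg_horiz_4_ref(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int16_t *filter, int32_t h) {
+  int x, y, k;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < 4; ++x) {
+      int sum = 64;  /* rounding term, matches mtlo %[vector4a] */
+
+      for (k = 0; k < 8; ++k)
+        sum += src[x + k] * filter[k];
+
+      /* crop-table lookup (lbux) then rounded average (addqh_r.w) */
+      dst[x] = (dst[x] + clip_pixel(sum >> 7) + 1) >> 1;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+#endif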
+
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tn3], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tn3], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tn2], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn3], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tn1], 7(%[dst]) \n\t"
+
+ /* clamp and average */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"
+
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"
+
+ /* store bytes */
+ "sb %[tn2], 3(%[dst]) \n\t"
+ "sb %[tn3], 5(%[dst]) \n\t"
+ "sb %[tn1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
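+
+/* Editor's note (not part of the original patch): the asm above produces the
+ * eight outputs in two interleaved phases -- even pixels 0,2,4,6 from the
+ * aligned loads at 0/4/8/12(src) and odd pixels 1,3,5,7 from the loads at
+ * 1/5/9/13(src) -- so each preceu.ph.qbr/qbl pair feeds two taps at once.
+ * Both phases apply the same 8-tap filter; only the source offset differs. */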
+
+static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
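+
+/* Editor's note: a hypothetical usage note, not part of the original patch.
+ * count is the number of 16-pixel column groups per row; the dispatcher
+ * below passes 1 for w == 16 and 2 for w == 32: */
+#if 0
+convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
+                            filter_x, h, 1);  /* w == 16 */
+convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
+                            filter_x, h, 2);  /* w == 32 */
+#endif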
+
+static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
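+
+/* Editor's note (not part of the original patch): this is the 16-wide kernel
+ * above fixed at four column groups (w == 64), with the extra prefetches a
+ * 64-byte row needs. */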
+
+void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vp9_convolve_avg(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_avg_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ src -= 3;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_avg_horiz_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 8:
+ convolve_avg_horiz_8_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 16:
+ convolve_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 1);
+ break;
+ case 32:
+ convolve_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_avg_horiz_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ default:
+ vp9_convolve8_avg_horiz_c(src + 3, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
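+
+/* Editor's note: a hypothetical reading of the packed-word compares above,
+ * not part of the original patch. Each int32 load packs two little-endian
+ * int16 taps, so
+ *   ((const int32_t *)filter_x)[1] == 0x800000  <=>  taps 2,3 == 0,128,
+ * the unit "copy" filter, for which plain averaging suffices, and
+ *   ((const int32_t *)filter_x)[0] == 0         <=>  taps 0,1 == 0,0,
+ * for which the 2-tap kernel is enough. */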
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
new file mode 100644
index 0000000..0ef9dd5
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -0,0 +1,1284 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *vp9_ff_cropTbl;
+
+void vp9_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ vp9_ff_cropTbl_a[i] = 0;
+ vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
+}
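+
+/* Editor's note (not part of the original patch): after this init,
+ * vp9_ff_cropTbl[v] equals clip_pixel(v) for any v in
+ * [-CROP_WIDTH, 256 + CROP_WIDTH), so the asm kernels clamp with a single
+ * indexed byte load (lbux) instead of two compares and branches. */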
+
+static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tn1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [dst_ptr] "+r" (dst_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
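+
+/* Editor's note: a hypothetical scalar model of the transposed store above,
+ * not part of the original patch (assumes clip_pixel() from vp9_common.h):
+ */
+#if 0
+static void convolve_horiz_4_transposed_ref(const uint8_t *src,
+                                            int32_t src_stride, uint8_t *dst,
+                                            int32_t dst_stride,
+                                            const int16_t *filter, int32_t h) {
+  int x, y, k;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < 4; ++x) {
+      int sum = 64;
+
+      for (k = 0; k < 8; ++k)
+        sum += src[y * src_stride + x + k] * filter[k];
+
+      /* output row x, column y: convolve and transpose in one pass */
+      dst[x * dst_stride + y] = clip_pixel(sum >> 7);
+    }
+  }
+}
+#endif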
+
+static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4, n1;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp2], 0(%[src]) \n\t"
+ "ulw %[tp1], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp1] \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tp3] \n\t"
+ "preceu.ph.qbl %[n1], %[tp3] \n\t"
+ "ulw %[tp2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "lbux %[tp3], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp3], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "ulw %[tp1], 1(%[src]) \n\t"
+ "ulw %[tp3], 5(%[src]) \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[tp2], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "ulw %[tp2], 9(%[src]) \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n1], %[tp2] \n\t"
+ "ulw %[Temp1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[Temp1] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[n1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
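+
+/* Editor's note (not part of the original patch): the asm above walks two
+ * transposed output columns at once -- even results go down dst_ptr, odd
+ * results down odd_dst = dst_ptr + dst_stride, and both step by
+ * dst_pitch_2 = 2 * dst_stride per stored byte. */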
+
+static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "ulw %[qload2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
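+  /* the output is stored transposed: even-indexed pixels land on every
+     second row starting at dst, odd-indexed pixels one row lower via
+     odd_dst, with both pointers stepping by dst_pitch_2 */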
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
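+  /* each int32 read from the filter packs two adjacent int16 taps, so a
+     single dpa.w.ph applies two taps per instruction; this relies on the
+     8-tap filter array being 4-byte aligned */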
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "ulw %[qload2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
+void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y, k;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ for (k = 0; k < 8; ++k)
+ sum += src[x + k] * filter[k];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
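+
+/* The scalar fallback above also documents the arithmetic every DSPR2
+ * kernel in this file reproduces: an 8-tap multiply-accumulate, rounded
+ * and scaled by FILTER_BITS (7), then clamped to [0, 255]. As a sanity
+ * check, assuming the usual VP9 invariant that the taps sum to 128: for
+ * a flat input src[x + k] == v the sum is 128 * v, and
+ * clip_pixel(ROUND_POWER_OF_TWO(128 * v, 7)) == v, so flat regions pass
+ * through unchanged. */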
+
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x * dst_stride] = src[x];
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
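+  /* Two-pass filtering: the horizontal pass writes its output transposed
+     into temp (row stride intermediate_height), so the vertical pass can
+     reuse the same transposed-horizontal kernels and transpose the data
+     back into dst as it stores. */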
+ DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
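+  /* the 8-tap vertical filter needs 7 extra rows of horizontally
+     filtered data (3 above and 4 below each output row), hence the +7 */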
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc: pos 38 with extp's size of 31
+     selects accumulator bits 38..7, a right shift by FILTER_BITS (7);
+     combined with the 64 preloaded into each accumulator this yields
+     ROUND_POWER_OF_TWO(sum, FILTER_BITS) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (intermediate_height < h)
+ intermediate_height = h;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)
+ return vp9_convolve8_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
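+  /* The int16 taps are also read as int32 pairs: element [1] holds taps
+     2 and 3, so 0x800000 (taps {0, 128} read little-endian) means tap 3
+     carries the whole 128 gain and the filter is an identity, while
+     element [0] == 0 means taps 0 and 1 are zero, i.e. the bilinear
+     layout served by the 2-tap vp9_convolve2_dspr2 path below. */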
+ if ((((const int32_t *)filter_x)[1] == 0x800000)
+ && (((const int32_t *)filter_y)[1] == 0x800000))
+ return vp9_convolve_copy(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+  /* first pass: filter horizontally into the transposed temp buffer
+     (reduces to a plain copy when filter_x is the identity filter) */
+ if (filter_x[3] == 0x80) {
+ copy_horiz_transposed(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ w, intermediate_height);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ filter_x,
+ w, intermediate_height);
+ } else {
+ src -= (src_stride * 3 + 3);
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height,
+                                           (w / 16));
+ break;
+ case 64:
+ vp9_prefetch_load(src + 32);
+ convolve_horiz_64_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height);
+ break;
+ default:
+ convolve_horiz_transposed(src, src_stride,
+ temp, intermediate_height,
+ filter_x, w, intermediate_height);
+ break;
+ }
+ }
+
+  /* second pass: filter the transposed temp buffer to produce dst,
+     transposing back (again a plain copy for an identity filter_y) */
+ if (filter_y[3] == 0x80) {
+ copy_horiz_transposed(temp + 3, intermediate_height,
+ dst, dst_stride,
+ h, w);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_dspr2(temp + 3, intermediate_height,
+ dst, dst_stride,
+ filter_y,
+ h, w);
+ } else {
+ switch (h) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, w);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, w);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+                                           filter_y, w, (h / 16));
+ break;
+ case 64:
+ convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, w);
+ break;
+ default:
+ convolve_horiz_transposed(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, h, w);
+ break;
+ }
+ }
+}
+
+void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
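+  /* loads below use ulw (unaligned), but stores use sw, so the
+     specialized widths assume a 4-byte-aligned dst */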
+ switch (w) {
+ case 4:
+ {
+ uint32_t tp1;
+
+ /* 1 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], (%[src]) \n\t"
+ "sw %[tp1], (%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 8:
+ {
+ uint32_t tp1, tp2;
+
+ /* 2 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 16:
+ {
+ uint32_t tp1, tp2, tp3, tp4;
+
+ /* 4 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 32:
+ {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ /* 8 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+ [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 64:
+ {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_load(src + src_stride + 64);
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 36(%[src]) \n\t"
+ "ulw %[tp3], 40(%[src]) \n\t"
+ "ulw %[tp4], 44(%[src]) \n\t"
+ "ulw %[tp5], 48(%[src]) \n\t"
+ "ulw %[tp6], 52(%[src]) \n\t"
+ "ulw %[tp7], 56(%[src]) \n\t"
+ "ulw %[tp8], 60(%[src]) \n\t"
+
+ "sw %[tp1], 32(%[dst]) \n\t" /* store */
+ "sw %[tp2], 36(%[dst]) \n\t" /* store */
+ "sw %[tp3], 40(%[dst]) \n\t" /* store */
+ "sw %[tp4], 44(%[dst]) \n\t" /* store */
+ "sw %[tp5], 48(%[dst]) \n\t" /* store */
+ "sw %[tp6], 52(%[dst]) \n\t" /* store */
+ "sw %[tp7], 56(%[dst]) \n\t" /* store */
+ "sw %[tp8], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+ [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ default:
+ for (y = h; y--; ) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = src[x];
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
new file mode 100644
index 0000000..0303896
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[tn1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[n2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[n1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst),
+ [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst),
+ [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
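+  /* tap pair {2,3} == {0, 128} marks the identity filter (plain copy);
+     a zero first tap pair marks a bilinear-style filter with only the
+     middle taps non-zero, served by the cheaper 2-tap path */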
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ vp9_prefetch_load((const uint8_t *)filter_x);
+ src -= 3;
+
+      /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_horiz_8_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_horiz_64_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src + 3, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
new file mode 100644
index 0000000..0930bb3
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
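+  /* start three rows up: the 8-tap filter reads source rows y - 3 to
+     y + 4 for output row y */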
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
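+              /* each accumulator computes one of four adjacent output
+                 columns; the precrq.ph.w / append pairs below pack two
+                 vertically adjacent samples of one column into a
+                 halfword pair for dpa.w.ph */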
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
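+/* Same inner loop as convolve_vert_4_dspr2, specialized for w == 64 so
+ * the width loop bound is a constant and a second destination cache line
+ * can be prefetched per row. */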
+static void convolve_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
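+  /* filter_y holds eight int16 taps; reading it as int32 words pairs the
+   * taps up.  On this little-endian target, 0x00800000 in the second word
+   * means tap 3 == 128 (unity gain in Q7) and tap 2 == 0, which for the
+   * filters VP9 uses indicates the identity filter, so a plain copy
+   * suffices.  A zero first word (taps 0 and 1 both zero) indicates a
+   * short filter such as bilinear, handled by the cheaper 2-tap path. */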
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+      /* bit position for extp extracts from the accumulators: with
+       * pos = 38 each extp returns (acc + 64) >> 7, the FILTER_BITS
+       * rounding shift (vector4a preloads the 64 into each acc) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4 :
+ case 8 :
+ case 16 :
+ case 32 :
+ convolve_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64 :
+ vp9_prefetch_store(dst + 32);
+ convolve_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
+
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
new file mode 100644
index 0000000..1b2f550
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -0,0 +1,1315 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
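+/* 1-D 16-point IDCT applied to no_rows rows of input.  Results are
+ * written transposed -- out[16 * c + row] holds output coefficient c of
+ * the given row -- so the column pass can read each column as a
+ * contiguous block of 16 int16 values. */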
+static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_10, step1_11, step1_12, step1_13;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+
+ for (i = no_rows; i--; ) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
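+    /* The asm blocks below split the 16-point butterfly: the first and
+     * fourth blocks build the even half (step1_0..step1_7), the second
+     * and third build the odd half (step2_8..step2_15), the fifth derives
+     * step1_10..step1_13, and the last two recombine and store the 16
+     * outputs. */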
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step2_12] \n\t"
+ "add %[load5], %[load5], %[step2_15] \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step2_13] \n\t"
+ "add %[load6], %[load6], %[step2_14] \n\t"
+ "sh %[load5], 0(%[output]) \n\t"
+ "sh %[load6], 32(%[output]) \n\t"
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "add %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+ "add %[load6], %[load6], %[step2_11] \n\t"
+ "sh %[load5], 192(%[output]) \n\t"
+ "sh %[load6], 224(%[output]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "sub %[load5], %[load5], %[step2_11] \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step2_9] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "sh %[load5], 256(%[output]) \n\t"
+ "sh %[load6], 288(%[output]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_14] \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_15] \n\t"
+ "sh %[load5], 448(%[output]) \n\t"
+ "sh %[load6], 480(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sh %[load5], 64(%[output]) \n\t"
+ "sh %[load6], 96(%[output]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sh %[load5], 128(%[output]) \n\t"
+ "sh %[load6], 160(%[output]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sh %[load5], 320(%[output]) \n\t"
+ "sh %[load6], 352(%[output]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sh %[load5], 384(%[output]) \n\t"
+ "sh %[load6], 416(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+ );
+
+ input += 16;
+ output += 1;
+ }
+}
+
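+/* Column pass: applies the same 16-point IDCT down each of the 16
+ * columns, then rounds with (x + 32) >> 6, adds the result to the
+ * destination pixels and clips through the crop table. */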
+static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_8, step1_9, step1_10, step1_11;
+ int step1_12, step1_13, step1_14, step1_15;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 16; ++i) {
+ dest_pix = (dest + i);
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
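+    /* the remaining stage-1 terms are simple sums, so they are computed
+     * in plain C rather than in the asm above */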
+ step1_8 = step2_8 + step2_11;
+ step1_9 = step2_9 + step2_10;
+ step1_14 = step2_13 + step2_14;
+ step1_15 = step2_12 + step2_15;
+
+ __asm__ __volatile__ (
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step1_15] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step1_14] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[load5], %[step1_9] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step1_8] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step1_8] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step1_9] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step1_14] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step1_15] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+ [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+ );
+
+ input += 16;
+ }
+}
+
+void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ uint32_t pos = 45;
+
+  /* bit position for extp extracts from the accumulators: with pos = 45
+   * each extp returns (acc + 8192) >> 14, the DCT_CONST_BITS rounding
+   * shift (const_2_power_13 preloads the 8192) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct16_1d_rows_dspr2(input, out, 16);
+
+ // Then transform columns and add to dest
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
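+/* 16-point inverse ADST, kept in plain C: four stages of butterflies
+ * with dct_const_round_shift applied wherever a cospi multiply occurs. */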
+static void iadst16_1d(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = output[8]
+ = output[9] = output[10] = output[11] = output[12]
+ = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = dct_const_round_shift(s0 + s8);
+ x1 = dct_const_round_shift(s1 + s9);
+ x2 = dct_const_round_shift(s2 + s10);
+ x3 = dct_const_round_shift(s3 + s11);
+ x4 = dct_const_round_shift(s4 + s12);
+ x5 = dct_const_round_shift(s5 + s13);
+ x6 = dct_const_round_shift(s6 + s14);
+ x7 = dct_const_round_shift(s7 + s15);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
+ x10 = dct_const_round_shift(s2 - s10);
+ x11 = dct_const_round_shift(s3 - s11);
+ x12 = dct_const_round_shift(s4 - s12);
+ x13 = dct_const_round_shift(s5 - s13);
+ x14 = dct_const_round_shift(s6 - s14);
+ x15 = dct_const_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = dct_const_round_shift(s8 + s12);
+ x9 = dct_const_round_shift(s9 + s13);
+ x10 = dct_const_round_shift(s10 + s14);
+ x11 = dct_const_round_shift(s11 + s15);
+ x12 = dct_const_round_shift(s8 - s12);
+ x13 = dct_const_round_shift(s9 - s13);
+ x14 = dct_const_round_shift(s10 - s14);
+ x15 = dct_const_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = dct_const_round_shift(s4 + s6);
+ x5 = dct_const_round_shift(s5 + s7);
+ x6 = dct_const_round_shift(s4 - s6);
+ x7 = dct_const_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = dct_const_round_shift(s12 + s14);
+ x13 = dct_const_round_shift(s13 + s15);
+ x14 = dct_const_round_shift(s12 - s14);
+ x15 = dct_const_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (- x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (- x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = dct_const_round_shift(s2);
+ x3 = dct_const_round_shift(s3);
+ x6 = dct_const_round_shift(s6);
+ x7 = dct_const_round_shift(s7);
+ x10 = dct_const_round_shift(s10);
+ x11 = dct_const_round_shift(s11);
+ x14 = dct_const_round_shift(s14);
+ x15 = dct_const_round_shift(s15);
+
+ output[0] = x0;
+ output[1] = -x8;
+ output[2] = x12;
+ output[3] = -x4;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
+ output[13] = -x13;
+ output[14] = x9;
+ output[15] = -x1;
+}
+
+void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int pitch, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ int16_t temp_out[16];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct16_1d_rows_dspr2(input, outptr, 16);
+ idct16_1d_cols_add_blk_dspr2(out, dest, pitch);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct16_1d_rows_dspr2(input, outptr, 16);
+
+ outptr = out;
+
+ for (i = 0; i < 16; ++i) {
+ iadst16_1d(outptr, temp_out);
+
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ outptr += 16;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ {
+ int16_t temp_in[16 * 16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
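+        /* iadst16_1d wrote its results row by row; transpose so the
+         * column pass can read each column contiguously */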
+ for (i = 0; i < 16; ++i)
+ for (j = 0; j < 16; ++j)
+ temp_in[j * 16 + i] = out[i * 16 + j];
+
+ idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);
+ }
+ break;
+ case ADST_ADST: // ADST in both directions
+ {
+ int16_t temp_in[16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ iadst16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ }
+ }
+ break;
+ default:
+      printf("vp9_iht16x16_256_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+ idct16_1d_rows_dspr2(input, outptr, 4);
+
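+  /* rows 4..15 of the transposed intermediate were never written by the
+   * 4-row pass above, so clear them; each sw zeroes two int16 values and
+   * the 16 stores sweep one pair of rows across all 16 coefficients */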
+ outptr += 4;
+ for (i = 0; i < 6; ++i) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 128(%[outptr]) \n\t"
+ "sw $zero, 160(%[outptr]) \n\t"
+ "sw $zero, 192(%[outptr]) \n\t"
+ "sw $zero, 224(%[outptr]) \n\t"
+ "sw $zero, 256(%[outptr]) \n\t"
+ "sw $zero, 288(%[outptr]) \n\t"
+ "sw $zero, 320(%[outptr]) \n\t"
+ "sw $zero, 352(%[outptr]) \n\t"
+ "sw $zero, 384(%[outptr]) \n\t"
+ "sw $zero, 416(%[outptr]) \n\t"
+ "sw $zero, 448(%[outptr]) \n\t"
+ "sw $zero, 480(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ outptr += 2;
+ }
+
+ // Then transform columns
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
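+  /* a1 is the single DC contribution shared by every pixel: replicate
+   * |a1| into all four byte lanes and use saturating quad-byte adds or
+   * subtracts to update four pixels per word */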
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
new file mode 100644
index 0000000..5e92db3
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
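+/* Column pass of the 32x32 IDCT.  The input holds the row-transformed
+ * coefficients transposed, so each column is a contiguous block of 32
+ * int16 values.  dest_pix starts at the top of the output column while
+ * dest_pix1 starts at the bottom (dest + 31 * dest_stride), letting the
+ * symmetric final-stage outputs be stored from both ends. */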
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
+ int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
+ int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
+ int16_t step3_28, step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i, temp21;
+ uint8_t *dest_pix, *dest_pix1;
+ const int const_2_power_13 = 8192;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 32; ++i) {
+ dest_pix = dest + i;
+ dest_pix1 = dest + i + 31 * dest_stride;
+
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
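+    /* input[5]/input[27] and input[21]/input[11]
+       -> step1_20, step1_21, step1_26, step1_27 */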
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
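+    /* input[13]/input[19] and input[29]/input[3]
+       -> step1_22, step1_23, step1_24, step1_25 */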
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
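+    /* even half: input[2]/input[30] and input[18]/input[14]
+       -> step2_8, step2_9, step2_14, step2_15 */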
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
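+    /* input[10]/input[22] and input[26]/input[6]
+       -> step2_10, step2_11, step2_12, step2_13 */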
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
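+    /* combine step2_8..15 into step3_8..15; the four accumulators hold
+       the cospi_16_64 rotations that produce step3_10..step3_13 */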
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
+ [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
+ [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
+ );
+
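+    /* for the remaining rotations, one output of each pair is computed on
+       a DSP accumulator and its partner with the equivalent C expression */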
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
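+    /* even quarter: input[0]/input[16] and input[8]/input[24]
+       -> step1_0, step1_1, step1_2, step1_3 */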
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
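+    /* input[4]/input[28] and input[20]/input[12] -> step1_4..step1_7 */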
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+    // stage 7: combine the even half (step2_0..7) with the odd half
+    // (step3_8..15)
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
+ [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
+ [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
+ [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
+ [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
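+    /* reconstruction, rows 0..3 of this column: (sum + 32) >> 6 is added
+       to the destination pixel and clamped through cm */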
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_2], %[step2_29] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_3], %[step2_28] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
+ [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
+ );
+
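+    /* the bottom half mirrors the top: step3_12..15 are reused as scratch
+       for the already-rounded samples written upward from row 31 through
+       dest_pix1 */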
+ step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_6], %[step1_25] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_7], %[step1_24] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
+ [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_10], %[step1_21] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_11], %[step1_20] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
+ [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_14], %[step2_17] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_15], %[step2_16] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
+ [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
+ [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ input += 32;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
new file mode 100644
index 0000000..d3aee73
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -0,0 +1,1013 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int16_t step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int16_t step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int temp21;
+ int i;
+ const int const_2_power_13 = 8192;
+ const int32_t *input_int;
+
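+  /* one 32-coefficient row per iteration; results are stored transposed
+     (consecutive outputs 32 * sizeof(int16_t) == 64 bytes apart, with
+     output advancing by one element per row) */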
+ for (i = 32; i--; ) {
+ input_int = (const int32_t *)input;
+
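+    /* all-zero row (tested two coefficients per int32 load): zero the
+       transposed output column and skip the transform */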
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+ input += 32;
+
+ __asm__ __volatile__ (
+ "sh $zero, 0(%[output]) \n\t"
+ "sh $zero, 64(%[output]) \n\t"
+ "sh $zero, 128(%[output]) \n\t"
+ "sh $zero, 192(%[output]) \n\t"
+ "sh $zero, 256(%[output]) \n\t"
+ "sh $zero, 320(%[output]) \n\t"
+ "sh $zero, 384(%[output]) \n\t"
+ "sh $zero, 448(%[output]) \n\t"
+ "sh $zero, 512(%[output]) \n\t"
+ "sh $zero, 576(%[output]) \n\t"
+ "sh $zero, 640(%[output]) \n\t"
+ "sh $zero, 704(%[output]) \n\t"
+ "sh $zero, 768(%[output]) \n\t"
+ "sh $zero, 832(%[output]) \n\t"
+ "sh $zero, 896(%[output]) \n\t"
+ "sh $zero, 960(%[output]) \n\t"
+ "sh $zero, 1024(%[output]) \n\t"
+ "sh $zero, 1088(%[output]) \n\t"
+ "sh $zero, 1152(%[output]) \n\t"
+ "sh $zero, 1216(%[output]) \n\t"
+ "sh $zero, 1280(%[output]) \n\t"
+ "sh $zero, 1344(%[output]) \n\t"
+ "sh $zero, 1408(%[output]) \n\t"
+ "sh $zero, 1472(%[output]) \n\t"
+ "sh $zero, 1536(%[output]) \n\t"
+ "sh $zero, 1600(%[output]) \n\t"
+ "sh $zero, 1664(%[output]) \n\t"
+ "sh $zero, 1728(%[output]) \n\t"
+ "sh $zero, 1792(%[output]) \n\t"
+ "sh $zero, 1856(%[output]) \n\t"
+ "sh $zero, 1920(%[output]) \n\t"
+ "sh $zero, 1984(%[output]) \n\t"
+
+ :
+ : [output] "r" (output)
+ );
+
+ output += 1;
+
+ continue;
+ }
+
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 32));
+ vp9_prefetch_load((const uint8_t *)(input + 48));
+
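+    /* odd half, as in the column pass: input[1]/input[31] and
+       input[17]/input[15] -> step1_16, step1_17, step1_30, step1_31 */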
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
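+    /* even quarter: input[0]/input[16] and input[8]/input[24]
+       -> step1_0, step1_1, step1_2, step1_3 */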
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ // final stage
+ output[0 * 32] = step1_0 + step2_31;
+ output[1 * 32] = step1_1 + step2_30;
+ output[2 * 32] = step1_2 + step2_29;
+ output[3 * 32] = step1_3 + step2_28;
+ output[4 * 32] = step1_4 + step1_27;
+ output[5 * 32] = step1_5 + step1_26;
+ output[6 * 32] = step1_6 + step1_25;
+ output[7 * 32] = step1_7 + step1_24;
+ output[8 * 32] = step1_8 + step1_23;
+ output[9 * 32] = step1_9 + step1_22;
+ output[10 * 32] = step1_10 + step1_21;
+ output[11 * 32] = step1_11 + step1_20;
+ output[12 * 32] = step1_12 + step2_19;
+ output[13 * 32] = step1_13 + step2_18;
+ output[14 * 32] = step1_14 + step2_17;
+ output[15 * 32] = step1_15 + step2_16;
+ output[16 * 32] = step1_15 - step2_16;
+ output[17 * 32] = step1_14 - step2_17;
+ output[18 * 32] = step1_13 - step2_18;
+ output[19 * 32] = step1_12 - step2_19;
+ output[20 * 32] = step1_11 - step1_20;
+ output[21 * 32] = step1_10 - step1_21;
+ output[22 * 32] = step1_9 - step1_22;
+ output[23 * 32] = step1_8 - step1_23;
+ output[24 * 32] = step1_7 - step1_24;
+ output[25 * 32] = step1_6 - step1_25;
+ output[26 * 32] = step1_5 - step1_26;
+ output[27 * 32] = step1_4 - step1_27;
+ output[28 * 32] = step1_3 - step2_28;
+ output[29 * 32] = step1_2 - step2_29;
+ output[30 * 32] = step1_1 - step2_30;
+ output[31 * 32] = step1_0 - step2_31;
+
+ input += 32;
+ output += 1;
+ }
+}
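+
+/* Editor's note (illustrative sketch, not part of the original patch):
+ * each mtlo/mthi + madd/msub + extp sequence above evaluates the scalar
+ * helper below.  Preloading the accumulator with 2^13 supplies the
+ * rounding term, and "extp rt, $ac, 31" with the wrdsp extract position
+ * set to 45 pulls out accumulator bits 45..14, i.e. an arithmetic right
+ * shift by DCT_CONST_BITS (14).  DCT_CONST_ROUNDING and DCT_CONST_BITS
+ * are defined in vp9/common/vp9_idct.h. */
+static INLINE int dspr2_round_shift_ref(int64_t sum_of_products) {
+  return (int)((sum_of_products + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
+}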
+
+void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
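+
+/* Editor's note (illustrative sketch, not part of the original patch):
+ * replv.qb broadcasts a1 into all four bytes of a word, and
+ * addu_s.qb/subu_s.qb then apply the DC offset to four pixels at a time
+ * with unsigned saturation.  A scalar equivalent using clip_pixel() from
+ * vp9_common.h would be: */
+static INLINE void add_dc_ref(uint8_t *dest, int stride, int a1, int size) {
+  int r, c;
+  for (r = 0; r < size; ++r) {
+    for (c = 0; c < size; ++c)
+      dest[c] = clip_pixel(dest[c] + a1);  /* saturates to [0, 255] */
+    dest += stride;
+  }
+}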
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
new file mode 100644
index 0000000..5b7aa5e
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+
+ for (i = 4; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+
+ "add %[Temp1], %[step_1], %[step_2] \n\t"
+ "sh %[Temp1], 8(%[output]) \n\t"
+
+ "sub %[Temp2], %[step_1], %[step_2] \n\t"
+ "sh %[Temp2], 16(%[output]) \n\t"
+
+ "sub %[Temp3], %[step_0], %[step_3] \n\t"
+ "sh %[Temp3], 24(%[output]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [output] "+r" (output)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input)
+ );
+
+ input += 4;
+ output += 1;
+ }
+}
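+
+/* Editor's note: the row pass above stores with halfword offsets
+ * 0/8/16/24 while advancing output by one element per row, so the
+ * intermediate array ends up transposed.  The column pass below can
+ * then read each column as four consecutive coefficients. */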
+
+static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 4; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 4;
+ }
+}
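+
+/* Editor's note: the "lbux %[Temp2], %[Temp1](%[cm])" loads above clamp
+ * each reconstructed pixel through vp9_ff_cropTbl, a padded lookup
+ * table that maps out-of-range sums to 0 or 255; the effect matches
+ * clip_pixel(dest_pix[0] + ROUND_POWER_OF_TWO(step, 4)) without
+ * branching. */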
+
+void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 8 \n\t"
+ "sra %[a1], %[out], 4 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+
+static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = dct_const_round_shift(s0);
+ output[1] = dct_const_round_shift(s1);
+ output[2] = dct_const_round_shift(s2);
+ output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ int16_t temp_in[4 * 4], temp_out[4];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ outptr = out;
+
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(outptr, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+
+ outptr += 4;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
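+      /* iadst4_1d_dspr2() writes its results row-major, but the DSPr2
+       * column routine expects the transposed layout produced by the
+       * DCT row pass, hence the explicit transpose below. */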
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ temp_in[i * 4 + j] = out[j * 4 + i];
+ }
+ }
+ vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ iadst4_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 0000000..93a0840
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ const int const_2_power_13 = 8192;
+ int Temp0, Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ for (i = no_rows; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[Temp4], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[Temp4], %[Temp1] \n\t"
+ "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+ "add %[Temp1], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp1], 16(%[output]) \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp0], 32(%[output]) \n\t"
+ "add %[Temp1], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp1], 48(%[output]) \n\t"
+
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp0], 64(%[output]) \n\t"
+ "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp1], 80(%[output]) \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp0], 96(%[output]) \n\t"
+ "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp1], 112(%[output]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [output] "r" (output), [input] "r" (input)
+ );
+
+ input += 8;
+ output += 1;
+ }
+}
+
+static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int Temp0, Temp1, Temp2, Temp3;
+ int i;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 8; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[step1_6], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[step1_6], %[Temp1] \n\t"
+ "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /* add block */
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 8;
+ }
+}
+
+void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3, x4, x5, x6, x7;
+
+ x0 = input[7];
+ x1 = input[0];
+ x2 = input[5];
+ x3 = input[2];
+ x4 = input[3];
+ x5 = input[4];
+ x6 = input[1];
+ x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+ x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+ x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+ x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+ output[0] = x0;
+ output[1] = -x4;
+ output[2] = x6;
+ output[3] = -x2;
+ output[4] = x3;
+ output[5] = -x7;
+ output[6] = x5;
+ output[7] = -x1;
+}
+
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ int16_t temp_in[8 * 8], temp_out[8];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct8_1d_rows_dspr2(input, outptr, 8);
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(&out[i * 8], temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ temp_in[i * 8 + j] = out[j * 8 + i];
+ }
+ }
+ idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+
+ iadst8_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
+
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 48(%[outptr]) \n\t"
+ "sw $zero, 52(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 68(%[outptr]) \n\t"
+ "sw $zero, 80(%[outptr]) \n\t"
+ "sw $zero, 84(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 100(%[outptr]) \n\t"
+ "sw $zero, 112(%[outptr]) \n\t"
+ "sw $zero, 116(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
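+
+/* Editor's note: the "_10" variant assumes the at most ten nonzero
+ * coefficients all lie in the top-left 4x4 of the 8x8 block, so only
+ * four rows are transformed (no_rows == 4) and the remaining entries
+ * of the transposed intermediate are cleared by the "sw $zero" stores
+ * above before the full column pass runs. */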
+
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 16 \n\t"
+ "sra %[a1], %[out], 5 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 864e27e..0d65651 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -41,30 +41,25 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
vpx_free(cm->mip);
vpx_free(cm->prev_mip);
- vpx_free(cm->above_seg_context);
vpx_free(cm->last_frame_seg_map);
vpx_free(cm->mi_grid_base);
vpx_free(cm->prev_mi_grid_base);
- vpx_free(cm->above_context[0]);
- for (i = 0; i < MAX_MB_PLANE; i++)
- cm->above_context[i] = 0;
cm->mip = NULL;
cm->prev_mip = NULL;
- cm->above_seg_context = NULL;
cm->last_frame_seg_map = NULL;
cm->mi_grid_base = NULL;
cm->prev_mi_grid_base = NULL;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
- cm->mb_cols = (aligned_width + 8) >> 4;
- cm->mb_rows = (aligned_height + 8) >> 4;
- cm->MBs = cm->mb_rows * cm->mb_cols;
-
cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
+
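+  // A macroblock spans 2x2 mi units (8x8 each), so round up to count
+  // any partial macroblock at the right/bottom edge.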
+ cm->mb_cols = (cm->mi_cols + 1) >> 1;
+ cm->mb_rows = (cm->mi_rows + 1) >> 1;
+ cm->MBs = cm->mb_rows * cm->mb_cols;
}
static void setup_mi(VP9_COMMON *cm) {
@@ -85,7 +80,7 @@ static void setup_mi(VP9_COMMON *cm) {
}
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
- int i, mi_cols;
+ int i;
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
@@ -140,21 +135,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
setup_mi(cm);
- // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
- // information is exposed at this level
- mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-
- // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
- // block where mi unit size is 8x8.
- cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
- (2 * mi_cols), 1);
- if (!cm->above_context[0])
- goto fail;
-
- cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
- if (!cm->above_seg_context)
- goto fail;
-
// Create the segmentation map structure and set to 0.
cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
if (!cm->last_frame_seg_map)
@@ -170,13 +150,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_create_common(VP9_COMMON *cm) {
vp9_machine_specific_config(cm);
- vp9_init_mbmode_probs(cm);
-
cm->tx_mode = ONLY_4X4;
cm->comp_pred_mode = HYBRID_PREDICTION;
-
- // Initialize reference frame sign bias structure to defaults
- vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
}
void vp9_remove_common(VP9_COMMON *cm) {
@@ -184,24 +159,19 @@ void vp9_remove_common(VP9_COMMON *cm) {
}
void vp9_initialize_common() {
+ vp9_init_neighbors();
vp9_coef_tree_initialize();
vp9_entropy_mode_init();
vp9_entropy_mv_init();
}
void vp9_update_frame_size(VP9_COMMON *cm) {
- int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
- mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- for (i = 1; i < MAX_MB_PLANE; i++)
- cm->above_context[i] =
- cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
-
// Initialize the previous frame segment map to 0.
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index c8d677f..d0d4852 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -20,6 +20,7 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
@@ -52,18 +53,10 @@ static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
typedef enum {
KEY_FRAME = 0,
INTER_FRAME = 1,
- NUM_FRAME_TYPES,
+ FRAME_TYPES,
} FRAME_TYPE;
typedef enum {
- EIGHTTAP = 0,
- EIGHTTAP_SMOOTH = 1,
- EIGHTTAP_SHARP = 2,
- BILINEAR = 3,
- SWITCHABLE = 4 /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
-
-typedef enum {
DC_PRED, // Average of above and left pixels
V_PRED, // Vertical
H_PRED, // Horizontal
@@ -81,10 +74,6 @@ typedef enum {
MB_MODE_COUNT
} MB_PREDICTION_MODE;
-static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
- return mode <= TM_PRED;
-}
-
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -101,10 +90,10 @@ static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
-union b_mode_info {
+typedef struct {
MB_PREDICTION_MODE as_mode;
int_mv as_mv[2]; // first, second inter predictor motion vectors
-};
+} b_mode_info;
typedef enum {
NONE = -1,
@@ -137,7 +126,7 @@ typedef struct {
TX_SIZE tx_size;
int_mv mv[2]; // for each reference frame used
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
- int_mv best_mv, best_second_mv;
+ int_mv best_mv[2];
uint8_t mode_context[MAX_REF_FRAMES];
@@ -147,14 +136,14 @@ typedef struct {
// Flags used for prediction status of various bit-stream signals
unsigned char seg_id_predicted;
- INTERPOLATIONFILTERTYPE interp_filter;
+ INTERPOLATION_TYPE interp_filter;
BLOCK_SIZE sb_type;
} MB_MODE_INFO;
typedef struct {
MB_MODE_INFO mbmi;
- union b_mode_info bmi[4];
+ b_mode_info bmi[4];
} MODE_INFO;
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
@@ -203,22 +192,15 @@ typedef struct macroblockd {
struct scale_factors scale_factor[2];
MODE_INFO *last_mi;
- MODE_INFO *this_mi;
int mode_info_stride;
- MODE_INFO *mic_stream_ptr;
-
// A NULL indicates that the 8x8 is not part of the image
MODE_INFO **mi_8x8;
MODE_INFO **prev_mi_8x8;
+ MODE_INFO *mi_stream;
int up_available;
int left_available;
- int right_available;
-
- // partition contexts
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT *left_seg_context;
/* Distance of MB away from frame edges */
int mb_to_left_edge;
@@ -228,14 +210,10 @@ typedef struct macroblockd {
int lossless;
/* Inverse transform function pointers. */
- void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
- void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
- void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
struct subpix_fn_table subpix;
- int allow_high_precision_mv;
-
int corrupted;
unsigned char sb_index; // index of 32x32 block inside the 64x64 block
@@ -245,71 +223,15 @@ typedef struct macroblockd {
int q_index;
-} MACROBLOCKD;
+ /* Y,U,V,(A) */
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
- switch (subsize) {
- case BLOCK_64X64:
- case BLOCK_64X32:
- case BLOCK_32X64:
- case BLOCK_32X32:
- return &xd->sb_index;
- case BLOCK_32X16:
- case BLOCK_16X32:
- case BLOCK_16X16:
- return &xd->mb_index;
- case BLOCK_16X8:
- case BLOCK_8X16:
- case BLOCK_8X8:
- return &xd->b_index;
- case BLOCK_8X4:
- case BLOCK_4X8:
- case BLOCK_4X4:
- return &xd->ab_index;
- default:
- assert(0);
- return NULL;
- }
-}
-
-static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
- BLOCK_SIZE sb_size) {
- const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
- const int bwl = b_width_log2(sb_type);
- const int bhl = b_height_log2(sb_type);
- const int boffset = b_width_log2(BLOCK_64X64) - bsl;
- const char pcval0 = ~(0xe << boffset);
- const char pcval1 = ~(0xf << boffset);
- const char pcvalue[2] = {pcval0, pcval1};
-
- assert(MAX(bwl, bhl) <= bsl);
-
- // update the partition context at the end notes. set partition bits
- // of block sizes larger than the current one to be one, and partition
- // bits of smaller block sizes to be zero.
- vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
- vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
-}
-
-static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
- int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
- int above = 0, left = 0, i;
- int boffset = mi_width_log2(BLOCK_64X64) - bsl;
-
- assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
- assert(bsl >= 0);
- assert(boffset >= 0);
-
- for (i = 0; i < bs; i++)
- above |= (xd->above_seg_context[i] & (1 << boffset));
- for (i = 0; i < bs; i++)
- left |= (xd->left_seg_context[i] & (1 << boffset));
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
+} MACROBLOCKD;
- above = (above > 0);
- left = (left > 0);
- return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
-}
static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
@@ -321,7 +243,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
- const MODE_INFO *const mi = xd->this_mi;
+ const MODE_INFO *const mi = xd->mi_8x8[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
if (plane_type != PLANE_TYPE_Y_WITH_DC ||
@@ -336,13 +258,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
- mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
+ mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT;
}
static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
- mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
+ mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT;
}
static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
@@ -391,7 +313,7 @@ static INLINE void foreach_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi;
+ const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi;
  // block and transform sizes, as log2 of the number of 4x4 blocks ("*_b"):
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
  // the transform size varies per plane; look it up in a common way.
@@ -495,7 +417,7 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
+static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
int plane, int block, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
@@ -520,19 +442,22 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
}
if (xd->mb_to_bottom_edge < 0) {
- const int bh = 4 << b_height_log2(plane_bsize);
- const int umv_border_start = bh + (xd->mb_to_bottom_edge >>
- (3 + pd->subsampling_y));
- int i;
- const uint8_t c = buf[(umv_border_start - 1) * stride + x];
- uint8_t *d = &buf[umv_border_start * stride + x];
-
- if (y + bh > umv_border_start)
- for (i = 0; i < bh; ++i, d += stride)
- *d = c;
+ if (xd->left_available || x >= 0) {
+ const int bh = 4 << b_height_log2(plane_bsize);
+ const int umv_border_start =
+ bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y));
+
+ if (y + bh > umv_border_start) {
+ const uint8_t c = buf[(umv_border_start - 1) * stride + x];
+ uint8_t *d = &buf[umv_border_start * stride + x];
+ int i;
+ for (i = 0; i < bh; ++i, d += stride)
+ *d = c;
+ }
+ }
}
}
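The restructured branch above fills the rows that fall below the visible frame by replicating the last in-frame row downward, so intra prediction can safely read past the bottom edge; the new left_available/x guard simply skips columns with no valid source pixel. A minimal self-contained sketch of the replication step (the function name and parameters are illustrative only):

#include <stdint.h>

/* For one column x, copy the pixel from the last row inside the frame
 * (border_start - 1) into each of the rows below the edge. */
static void replicate_below_edge(uint8_t *buf, int stride, int x,
                                 int border_start, int rows) {
  const uint8_t c = buf[(border_start - 1) * stride + x];
  uint8_t *d = &buf[border_start * stride + x];
  int i;
  for (i = 0; i < rows; ++i, d += stride)
    *d = c;
}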
-static void set_contexts_on_border(MACROBLOCKD *xd,
+static void set_contexts_on_border(const MACROBLOCKD *xd,
struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize,
int tx_size_in_blocks, int has_eob,
@@ -570,7 +495,7 @@ static void set_contexts_on_border(MACROBLOCKD *xd,
L[pt] = 0;
}
-static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
+static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const A = pd->above_context + aoff;
@@ -586,7 +511,7 @@ static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
}
}
-static int get_tx_eob(struct segmentation *seg, int segment_id,
+static int get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
const int eob_max = 16 << (tx_size << 1);
return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
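For reference, eob_max == 16 << (tx_size << 1) is just the full coefficient count of each transform: 16 for TX_4X4 (0), 64 for TX_8X8 (1), 256 for TX_16X16 (2), and 1024 for TX_32X32 (3); segments with SEG_LVL_SKIP active force an end-of-block of 0 instead.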
diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h
index 1796906..36d1cdf 100644
--- a/libvpx/vp9/common/vp9_common.h
+++ b/libvpx/vp9/common/vp9_common.h
@@ -40,8 +40,8 @@
vpx_memcpy(dest, src, n * sizeof(*src)); \
}
-#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest));
-#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest));
+#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest))
+#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
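Dropping the trailing semicolons from these macros fixes a classic pitfall: the old definitions expanded an invocation like vp9_zero(x); into a statement followed by an empty statement, which breaks brace-less if/else. A small sketch, using the two macros defined above, of code that only compiles with the fixed definitions:

/* With the old trailing-semicolon macros this failed to compile
 * ("else without a previous if"): the extra empty statement left after
 * expanding vp9_zero(*a); terminated the if early. */
static void reset_one(int cond, int *a, int *b, int n) {
  if (cond)
    vp9_zero(*a);
  else
    vp9_zero_array(b, n);
}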
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255u : (val < 0) ? 0u : val;
@@ -84,9 +84,11 @@ static int get_unsigned_bits(unsigned int num_values) {
} while (0)
#endif
-#define SYNC_CODE_0 0x49
-#define SYNC_CODE_1 0x83
-#define SYNC_CODE_2 0x42
+#define VP9_SYNC_CODE_0 0x49
+#define VP9_SYNC_CODE_1 0x83
+#define VP9_SYNC_CODE_2 0x42
+
+#define VP9_FRAME_MARKER 0x2
#endif // VP9_COMMON_VP9_COMMON_H_
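The renamed constants gain a VP9_ namespace prefix, and VP9_FRAME_MARKER names the 2-bit marker value that opens every uncompressed frame header. A hedged sketch of how the three sync bytes (0x49 0x83 0x42) might be checked at the byte level (the real decoder reads them through its bit reader):

#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

/* Returns 1 if p points at the three-byte VP9 sync code. */
static int is_vp9_sync_code(const uint8_t *p) {
  return p[0] == VP9_SYNC_CODE_0 && p[1] == VP9_SYNC_CODE_1 &&
         p[2] == VP9_SYNC_CODE_2;
}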
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
index dc41efd..f858900 100644
--- a/libvpx/vp9/common/vp9_common_data.c
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -115,6 +115,16 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = {
TX_16X16, TX_16X16, TX_16X16, TX_32X32
};
+const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+ TX_4X4, // ONLY_4X4
+ TX_8X8, // ALLOW_8X8
+ TX_16X16, // ALLOW_16X16
+ TX_32X32, // ALLOW_32X32
+ TX_32X32, // TX_MODE_SELECT
+};
+
+
+
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
@@ -133,3 +143,4 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};
+
diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h
index 3822bfc..c1f6405 100644
--- a/libvpx/vp9/common/vp9_common_data.h
+++ b/libvpx/vp9/common/vp9_common_data.h
@@ -27,6 +27,7 @@ extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
+extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
-#endif // VP9_COMMON_VP9_COMMON_DATA_H
+#endif  // VP9_COMMON_VP9_COMMON_DATA_H_
diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c
index 94231a1..a2d864c 100644
--- a/libvpx/vp9/common/vp9_convolve.c
+++ b/libvpx/vp9/common/vp9_convolve.c
@@ -7,13 +7,13 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/vp9_convolve.h"
#include <assert.h>
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
@@ -35,7 +35,7 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
for (y = 0; y < h; ++y) {
/* Initial phase offset */
- int x_q4 = (filter_x0 - filter_x_base) / taps;
+ int x_q4 = (int)(filter_x0 - filter_x_base) / taps;
for (x = 0; x < w; ++x) {
/* Per-pixel src offset */
@@ -76,7 +76,7 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
for (y = 0; y < h; ++y) {
/* Initial phase offset */
- int x_q4 = (filter_x0 - filter_x_base) / taps;
+ int x_q4 = (int)(filter_x0 - filter_x_base) / taps;
for (x = 0; x < w; ++x) {
/* Per-pixel src offset */
@@ -118,7 +118,7 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
/* Initial phase offset */
- int y_q4 = (filter_y0 - filter_y_base) / taps;
+ int y_q4 = (int)(filter_y0 - filter_y_base) / taps;
for (y = 0; y < h; ++y) {
/* Per-pixel src offset */
@@ -160,7 +160,7 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
/* Initial phase offset */
- int y_q4 = (filter_y0 - filter_y_base) / taps;
+ int y_q4 = (int)(filter_y0 - filter_y_base) / taps;
for (y = 0; y < h; ++y) {
/* Per-pixel src offset */
@@ -282,7 +282,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
int r;
for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
+ vpx_memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
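Two small fixes run through this file: memcpy is routed through vpx_memcpy like the rest of the codebase, and the phase computations gain an (int) cast because filter_x0 - filter_x_base is a ptrdiff_t (64-bit on LP64 targets) while x_q4 and y_q4 are plain ints. A minimal sketch of what the cast does (the contiguous 8-tap [phase][taps] layout is assumed, as in the code above):

#include <stddef.h>
#include <stdint.h>

/* The subpel filters are rows of a contiguous [phase][taps] table, so
 * the initial phase is (row pointer - table base) / taps.  The cast
 * narrows the ptrdiff_t difference to int before the division, matching
 * the type the x_q4/y_q4 accumulators store. */
static int initial_phase_q4(const int16_t *filter, const int16_t *base,
                            int taps) {
  return (int)(filter - base) / taps;
}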
diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h
index 13220e9..29d4990 100644
--- a/libvpx/vp9/common/vp9_convolve.h
+++ b/libvpx/vp9/common/vp9_convolve.h
@@ -7,23 +7,16 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_CONVOLVE_H_
-#define VP9_COMMON_CONVOLVE_H_
+#ifndef VP9_COMMON_VP9_CONVOLVE_H_
+#define VP9_COMMON_VP9_CONVOLVE_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-#define FILTER_BITS 7
-
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-struct subpix_fn_table {
- const int16_t (*filter_x)[8];
- const int16_t (*filter_y)[8];
-};
-
-#endif // VP9_COMMON_CONVOLVE_H_
+#endif // VP9_COMMON_VP9_CONVOLVE_H_
diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c
index 79f769e..355ac1a 100644
--- a/libvpx/vp9/common/vp9_debugmodes.c
+++ b/libvpx/vp9/common/vp9_debugmodes.c
@@ -63,9 +63,9 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
- log_frame_info(cm, "Vectors ",mvs);
+ log_frame_info(cm, "Vectors ", mvs);
for (mi_row = 0; mi_row < rows; mi_row++) {
- fprintf(mvs,"V ");
+ fprintf(mvs, "V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h
index 185fced..3b512be 100644
--- a/libvpx/vp9/common/vp9_default_coef_probs.h
+++ b/libvpx/vp9/common/vp9_default_coef_probs.h
@@ -7,6 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP9_COMMON_DEFAULT_COEF_PROBS_H_
/* Generated file, included by vp9_entropy.c */
static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
@@ -694,3 +696,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
}
};
+#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index 32d9e0c..d3a867c 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -52,156 +52,11 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
- 0, 4, 1, 5,
- 8, 2, 12, 9,
- 3, 6, 13, 10,
- 7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
- 0, 4, 8, 1,
- 12, 5, 9, 2,
- 13, 6, 10, 3,
- 7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
- 0, 1, 4, 2,
- 5, 3, 6, 8,
- 9, 7, 12, 10,
- 13, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
- 0, 8, 1, 16, 9, 2, 17, 24,
- 10, 3, 18, 25, 32, 11, 4, 26,
- 33, 19, 40, 12, 34, 27, 5, 41,
- 20, 48, 13, 35, 42, 28, 21, 6,
- 49, 56, 36, 43, 29, 7, 14, 50,
- 57, 44, 22, 37, 15, 51, 58, 30,
- 45, 23, 52, 59, 38, 31, 60, 53,
- 46, 39, 61, 54, 47, 62, 55, 63,
-};
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
- 0, 8, 16, 1, 24, 9, 32, 17,
- 2, 40, 25, 10, 33, 18, 48, 3,
- 26, 41, 11, 56, 19, 34, 4, 49,
- 27, 42, 12, 35, 20, 57, 50, 28,
- 5, 43, 13, 36, 58, 51, 21, 44,
- 6, 29, 59, 37, 14, 52, 22, 7,
- 45, 60, 30, 15, 38, 53, 23, 46,
- 31, 61, 39, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
- 0, 1, 2, 8, 9, 3, 16, 10,
- 4, 17, 11, 24, 5, 18, 25, 12,
- 19, 26, 32, 6, 13, 20, 33, 27,
- 7, 34, 40, 21, 28, 41, 14, 35,
- 48, 42, 29, 36, 49, 22, 43, 15,
- 56, 37, 50, 44, 30, 57, 23, 51,
- 58, 45, 38, 52, 31, 59, 53, 46,
- 60, 39, 61, 47, 54, 55, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
- 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
- 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
- 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
- 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
- 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
- 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
- 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
- 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
- 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
- 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
- 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
- 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
- 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
- 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
- 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
- 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
- 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
- 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
- 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
- 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
- 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
- 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
- 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
- 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
- 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
- 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
- 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
- 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
- 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
- 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
- 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
- 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
- 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
- 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
- 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
- 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
- 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
- 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
- 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
- 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
- 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
- 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
- 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
- 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
- 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
- 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158,
- 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
- 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
- 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
- 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197,
- 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136,
- 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451,
- 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234,
- 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, 486, 77, 204, 362,
- 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, 238, 48, 143,
- 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, 393, 300, 269, 176, 145,
- 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395,
- 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, 210, 179, 117, 86, 55, 738, 707,
- 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, 304,
- 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, 864, 833, 802, 771, 740, 709,
- 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493,
- 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, 743, 619, 495, 371, 247, 123,
- 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681,
- 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527,
- 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373,
- 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, 499, 375, 251, 127,
- 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, 685, 654, 592, 561,
- 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345,
- 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, 967, 874, 843, 750,
- 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, 440, 409,
- 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659,
- 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, 971,
- 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, 537, 444, 413, 972,
- 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
- 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, 1007, 883, 759, 635, 511,
- 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791,
- 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, 1011, 887, 763, 639,
- 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982,
- 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, 1016, 985, 954, 923,
- 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023,
-};
/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
-{
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-DCT_EOB_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
-ONE_TOKEN, 6, /* 2 = ONE */
@@ -419,7 +274,7 @@ static void init_bit_trees() {
init_bit_tree(cat6, 14);
}
-const vp9_extra_bit vp9_extra_bits[12] = {
+const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
{ 0, 0, 0, 0},
{ 0, 0, 0, 1},
{ 0, 0, 0, 2},
@@ -443,159 +298,7 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
- int n, l2 = l * l;
- for (n = 0; n < l2; n++) {
- int rc = scan[n];
- if (rc == idx)
- return n;
- }
- assert(0);
- return -1;
-}
-static void init_scan_neighbors(const int16_t *scan,
- int16_t *iscan,
- int l, int16_t *neighbors) {
- int l2 = l * l;
- int n, i, j;
-
- // dc doesn't use this type of prediction
- neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
- neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
- iscan[0] = find_in_scan(scan, l, 0);
- for (n = 1; n < l2; n++) {
- int rc = scan[n];
- iscan[n] = find_in_scan(scan, l, n);
- i = rc / l;
- j = rc % l;
- if (i > 0 && j > 0) {
- // col/row scan is used for adst/dct, and generally means that
- // energy decreases to zero much faster in the dimension in
- // which ADST is used compared to the direction in which DCT
- // is used. Likewise, we find much higher correlation between
- // coefficients within the direction in which DCT is used.
- // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
- // as a context. If ADST or DCT is used in both directions, we
- // use the combination of the two as a context.
- int a = (i - 1) * l + j;
- int b = i * l + j - 1;
- if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
- scan == vp9_col_scan_16x16) {
- // in the col/row scan cases (as well as left/top edge cases), we set
- // both contexts to the same value, so we can branchlessly do a+b+1>>1
- // which automatically becomes a if a == b
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = a;
- } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
- scan == vp9_row_scan_16x16) {
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = b;
- } else {
- neighbors[MAX_NEIGHBORS * n + 0] = a;
- neighbors[MAX_NEIGHBORS * n + 1] = b;
- }
- } else if (i > 0) {
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
- } else {
- assert(j > 0);
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
- }
- assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
- }
- // one padding item so we don't have to add branches in code to handle
- // calls to get_coef_context() for the token after the final dc token
- neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
- neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
-}
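Concretely, with the removed vp9_default_scan_4x4: the coefficient at raster position rc == 5 (i == 1, j == 1) has a == (i - 1) * 4 + j == 1 (its above neighbor) and b == i * 4 + j - 1 == 4 (its left neighbor). The default 2-D scan keeps both as context neighbors; the row/col scans, used when ADST is applied in one direction, collapse the pair onto the DCT-direction neighbor so the later (a + b + 1) >> 1 average degenerates to that single coefficient.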
-
-void vp9_init_neighbors() {
- init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
- vp9_default_scan_4x4_neighbors);
- init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
- vp9_row_scan_4x4_neighbors);
- init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
- vp9_col_scan_4x4_neighbors);
- init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
- vp9_default_scan_8x8_neighbors);
- init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
- vp9_row_scan_8x8_neighbors);
- init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
- vp9_col_scan_8x8_neighbors);
- init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
- vp9_default_scan_16x16_neighbors);
- init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
- vp9_row_scan_16x16_neighbors);
- init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
- vp9_col_scan_16x16_neighbors);
- init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
- vp9_default_scan_32x32_neighbors);
-}
-
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
- if (scan == vp9_default_scan_4x4) {
- return vp9_default_scan_4x4_neighbors;
- } else if (scan == vp9_row_scan_4x4) {
- return vp9_row_scan_4x4_neighbors;
- } else if (scan == vp9_col_scan_4x4) {
- return vp9_col_scan_4x4_neighbors;
- } else if (scan == vp9_default_scan_8x8) {
- return vp9_default_scan_8x8_neighbors;
- } else if (scan == vp9_row_scan_8x8) {
- return vp9_row_scan_8x8_neighbors;
- } else if (scan == vp9_col_scan_8x8) {
- return vp9_col_scan_8x8_neighbors;
- } else if (scan == vp9_default_scan_16x16) {
- return vp9_default_scan_16x16_neighbors;
- } else if (scan == vp9_row_scan_16x16) {
- return vp9_row_scan_16x16_neighbors;
- } else if (scan == vp9_col_scan_16x16) {
- return vp9_col_scan_16x16_neighbors;
- } else {
- assert(scan == vp9_default_scan_32x32);
- return vp9_default_scan_32x32_neighbors;
- }
-}
-
void vp9_coef_tree_initialize() {
- vp9_init_neighbors();
init_bit_trees();
vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
}
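The size rewrites above (22 becomes TREE_SIZE(MAX_ENTROPY_TOKENS)) rely on TREE_SIZE(n) being the 2 * n - 2 indices a full binary tree with n leaves needs. These vp9_tree_index arrays encode such trees directly: a positive entry is the index of the next left/right node pair, and a non-positive entry is the negated leaf value. A minimal decode sketch under those assumptions (the element type and the bit source are stand-ins for vp9_treecoder.h and the real bool decoder):

#include <stdint.h>

typedef int16_t vp9_tree_index;  /* assumption: width per vp9_treecoder.h */

/* Walk a vp9-style token tree: start at node 0, consume one bit per
 * step to pick the left/right child, and stop when the entry goes
 * non-positive, returning the (negated) leaf value.  Note ZERO_TOKEN
 * is 0, so the stop condition is "not positive", not "negative". */
static int tree_read(const vp9_tree_index *tree,
                     int (*read_bit)(void *ctx), void *ctx) {
  vp9_tree_index i = 0;
  do {
    i = tree[i + read_bit(ctx)];
  } while (i > 0);
  return -i;
}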
@@ -612,16 +315,15 @@ void vp9_coef_tree_initialize() {
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
- vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
+ const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
- int t, i, j, k, l;
+ int i, j, k, l, m;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
- vp9_prob coef_probs[UNCONSTRAINED_NODES];
for (i = 0; i < BLOCK_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
@@ -629,15 +331,14 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
- vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
- branch_ct, coef_counts[i][j][k][l],
- 0);
+ vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct,
+ coef_counts[i][j][k][l], 0);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- for (t = 0; t < UNCONSTRAINED_NODES; ++t)
- dst_coef_probs[i][j][k][l][t] = merge_probs(
- pre_coef_probs[i][j][k][l][t], coef_probs[t],
- branch_ct[t], count_sat, update_factor);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ dst_coef_probs[i][j][k][l][m] = merge_probs(
+ pre_coef_probs[i][j][k][l][m],
+ branch_ct[m],
+ count_sat, update_factor);
}
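After this change the per-token probability is derived inside merge_probs from the branch counts instead of being precomputed into a local coef_probs array. A hedged sketch of what a merge of this shape computes (the real merge_probs lives in vp9_treecoder.h and may differ in rounding and clamping details):

#include <stdint.h>

/* Blend the previous frame's probability toward one estimated from the
 * current frame's branch counts; the blend weight grows with the count
 * and saturates at count_sat, scaled by max_update_factor. */
static uint8_t merge_prob_sketch(uint8_t pre_prob,
                                 unsigned int ct0, unsigned int ct1,
                                 unsigned int count_sat,
                                 unsigned int max_update_factor) {
  const unsigned int den = ct0 + ct1;
  /* probability of the 0-branch from this frame, clamped to [1, 255] */
  unsigned int prob = den ? (256 * ct0 + den / 2) / den : 128;
  const unsigned int count = den < count_sat ? den : count_sat;
  const unsigned int factor = max_update_factor * count / count_sat;
  if (prob < 1) prob = 1;
  if (prob > 255) prob = 255;
  return (uint8_t)((pre_prob * (256 - factor) + prob * factor + 128) >> 8);
}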
}
@@ -645,7 +346,7 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
TX_SIZE t;
unsigned int count_sat, update_factor;
- if (cm->frame_type == KEY_FRAME || cm->intra_only) {
+ if (frame_is_intra_only(cm)) {
update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
count_sat = COEF_COUNT_SAT_KEY;
} else if (cm->last_frame_type == KEY_FRAME) {
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index f138c09..c58e852 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -12,9 +12,13 @@
#define VP9_COMMON_VP9_ENTROPY_H_
#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_treecoder.h"
+
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_treecoder.h"
+
+#define DIFF_UPDATE_PROB 252
/* Coefficient token alphabet */
@@ -36,7 +40,10 @@
#define INTER_MODE_CONTEXTS 7
-extern const vp9_tree_index vp9_coef_tree[];
+extern DECLARE_ALIGNED(16, const uint8_t,
+ vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
+extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */
extern const vp9_tree_index vp9_coefmodel_tree[];
@@ -44,13 +51,14 @@ extern const vp9_tree_index vp9_coefmodel_tree[];
extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
typedef struct {
- vp9_tree_p tree;
+ vp9_tree_index *tree;
const vp9_prob *prob;
int len;
int base_val;
} vp9_extra_bit;
-extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
+// indexed by token value
+extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS];
#define MAX_PROB 255
#define DCT_MAX_VALUE 16384
@@ -88,72 +96,14 @@ typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS];
typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[ENTROPY_NODES][2];
-typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [ENTROPY_NODES];
#define SUBEXP_PARAM 4 /* Subexponential code parameter */
#define MODULUS_PARAM 13 /* Modulus parameter */
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-#define MAX_NEIGHBORS 2
-
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-void vp9_coef_tree_initialize(void);
+void vp9_coef_tree_initialize();
void vp9_adapt_coef_probs(struct VP9Common *cm);
static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
@@ -183,16 +133,6 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) {
? (COEF_BANDS-1) : band_translate[coef_index];
}
-static INLINE int get_coef_context(const int16_t *neighbors,
- uint8_t *token_cache,
- int c) {
- return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
- token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
-}
-
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
-
-
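The removed inline records the coefficient-context rule, presumably relocated along with the scan tables to the new vp9_scan.h header included above: the context for token c is (1 + token_cache[nb0] + token_cache[nb1]) >> 1, the rounded mean of the two neighbors' cached energy classes. For example, neighbors of classes 1 and 2 give (1 + 1 + 2) >> 1 == 2.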
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// In-between probabilities are interpolated linearly.
@@ -210,171 +150,62 @@ typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS]
[UNCONSTRAINED_NODES + 1];
-typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [UNCONSTRAINED_NODES][2];
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_4x4;
- case DCT_ADST:
- return vp9_col_scan_4x4;
- default:
- return vp9_default_scan_4x4;
- }
-}
+static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
+ ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
-static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_4x4;
- *nb = vp9_row_scan_4x4_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_4x4;
- *nb = vp9_col_scan_4x4_neighbors;
- break;
- default:
- *scan = vp9_default_scan_4x4;
- *nb = vp9_default_scan_4x4_neighbors;
+ switch (tx_size) {
+ case TX_4X4:
+ above_ec = a[0] != 0;
+ left_ec = l[0] != 0;
break;
- }
-}
-
-static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_4x4;
- case DCT_ADST:
- return vp9_col_iscan_4x4;
- default:
- return vp9_default_iscan_4x4;
- }
-}
-
-static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_8x8;
- case DCT_ADST:
- return vp9_col_scan_8x8;
- default:
- return vp9_default_scan_8x8;
- }
-}
-
-static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_8x8;
- *nb = vp9_row_scan_8x8_neighbors;
+ case TX_8X8:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
break;
- case DCT_ADST:
- *scan = vp9_col_scan_8x8;
- *nb = vp9_col_scan_8x8_neighbors;
+ case TX_16X16:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
break;
- default:
- *scan = vp9_default_scan_8x8;
- *nb = vp9_default_scan_8x8_neighbors;
+ case TX_32X32:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
break;
- }
-}
-
-static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_8x8;
- case DCT_ADST:
- return vp9_col_iscan_8x8;
default:
- return vp9_default_iscan_8x8;
- }
-}
-
-static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_16x16;
- case DCT_ADST:
- return vp9_col_scan_16x16;
- default:
- return vp9_default_scan_16x16;
+ assert(!"Invalid transform size.");
}
-}
-static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_16x16;
- *nb = vp9_row_scan_16x16_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_16x16;
- *nb = vp9_col_scan_16x16_neighbors;
- break;
- default:
- *scan = vp9_default_scan_16x16;
- *nb = vp9_default_scan_16x16_neighbors;
- break;
- }
+ return combine_entropy_contexts(above_ec, left_ec);
}
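The new get_entropy_context tests all context bytes covered by one transform edge with a single widened load (2, 4, or 8 bytes for TX_8X8 through TX_32X32), which relies on the context arrays being suitably aligned. A byte-wise sketch of what each case computes (ENTROPY_CONTEXT is assumed to be a char-sized type):

/* Load-free equivalent: an edge contributes a context of 1 iff any of
 * its 1 << tx_size context bytes is non-zero. */
static int edge_context(const signed char *ctx, int tx_size) {
  const int n = 1 << tx_size;  /* 1, 2, 4 or 8 bytes */
  int i, any = 0;
  for (i = 0; i < n; ++i)
    any |= ctx[i] != 0;
  return any;
}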
-static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_16x16;
- case DCT_ADST:
- return vp9_col_iscan_16x16;
- default:
- return vp9_default_iscan_16x16;
- }
+static const uint8_t *get_band_translate(TX_SIZE tx_size) {
+ return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
+ : vp9_coefband_trans_8x8plus;
}
-static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size,
- PLANE_TYPE type, int block_idx,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
- const int16_t **scan,
- const uint8_t **band_translate) {
- ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
-
+static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ PLANE_TYPE type, int block_idx,
+ const int16_t **scan, const int16_t **scan_nb) {
switch (tx_size) {
case TX_4X4:
- *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
- *band_translate = vp9_coefband_trans_4x4;
- above_ec = A[0] != 0;
- left_ec = L[0] != 0;
+ get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb);
break;
case TX_8X8:
- *scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- *band_translate = vp9_coefband_trans_8x8plus;
- above_ec = !!*(uint16_t *)A;
- left_ec = !!*(uint16_t *)L;
+ get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb);
break;
case TX_16X16:
- *scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- *band_translate = vp9_coefband_trans_8x8plus;
- above_ec = !!*(uint32_t *)A;
- left_ec = !!*(uint32_t *)L;
+ get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb);
break;
case TX_32X32:
*scan = vp9_default_scan_32x32;
- *band_translate = vp9_coefband_trans_8x8plus;
- above_ec = !!*(uint64_t *)A;
- left_ec = !!*(uint64_t *)L;
+ *scan_nb = vp9_default_scan_32x32_neighbors;
break;
default:
assert(!"Invalid transform size.");
}
-
- return combine_entropy_contexts(above_ec, left_ec);
}
-enum { VP9_COEF_UPDATE_PROB = 252 };
-
#endif // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index 93c89b0..a963d55 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -14,204 +14,199 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
-const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES]
- [INTRA_MODES - 1] = {
- { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */,
- { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */,
- { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */,
- { 120, 11, 50, 123, 163, 135, 64, 77, 103 } /* y = d45 */,
- { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */,
- { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */,
- { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */,
- { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */,
- { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */,
- { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */
+const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+ { // above = dc
+ { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc
+ { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v
+ { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h
+ { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45
+ { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135
+ { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117
+ { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153
+ { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207
+ { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63
+ { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm
+ }, { // above = v
+ { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc
+ { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v
+ { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h
+ { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45
+ { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135
+ { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117
+ { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153
+ { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207
+ { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63
+ { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm
+ }, { // above = h
+ { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc
+ { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v
+ { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h
+ { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45
+ { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135
+ { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117
+ { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153
+ { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207
+ { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63
+ { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm
+ }, { // above = d45
+ { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc
+ { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v
+ { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h
+ { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45
+ { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135
+ { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117
+ { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153
+ { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207
+ { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63
+ { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm
+ }, { // above = d135
+ { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc
+ { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v
+ { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h
+ { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45
+ { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135
+ { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117
+ { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153
+ { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207
+ { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63
+ { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm
+ }, { // above = d117
+ { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc
+ { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v
+ { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h
+ { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45
+ { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135
+ { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117
+ { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153
+ { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207
+ { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63
+ { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm
+ }, { // above = d153
+ { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc
+ { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v
+ { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h
+ { 56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45
+ { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135
+ { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117
+ { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153
+ { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207
+ { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63
+ { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm
+ }, { // above = d207
+ { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc
+ { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v
+ { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h
+ { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45
+ { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135
+ { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117
+ { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153
+ { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207
+ { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63
+ { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm
+ }, { // above = d63
+ { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc
+ { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v
+ { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h
+ { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45
+ { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135
+ { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117
+ { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153
+ { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207
+ { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63
+ { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm
+ }, { // above = tm
+ { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc
+ { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v
+ { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h
+ { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45
+ { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135
+ { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117
+ { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153
+ { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207
+ { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63
+ { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm
+ }
};
-static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
- [INTRA_MODES - 1] = {
- { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */,
- { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */,
- { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */,
- { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */
+const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+ { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc
+ { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v
+ { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h
+ { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45
+ { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y = d135
+ { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117
+ { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153
+ { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207
+ { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63
+ { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm
};
-static const vp9_prob default_if_uv_probs[INTRA_MODES]
- [INTRA_MODES - 1] = {
- { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */,
- { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */,
- { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */,
- { 97, 5, 44, 131, 176, 139, 48, 68, 97 } /* y = d45 */,
- { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */,
- { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */,
- { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */,
- { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */,
- { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */,
- { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */
+static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+ { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8
+ { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16
+ { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32
+ { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32
};
-static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
- [NUM_PARTITION_CONTEXTS]
- [PARTITION_TYPES - 1] = {
- { /* frame_type = keyframe */
- /* 8x8 -> 4x4 */
- { 158, 97, 94 } /* a/l both not split */,
- { 93, 24, 99 } /* a split, l not split */,
- { 85, 119, 44 } /* l split, a not split */,
- { 62, 59, 67 } /* a/l both split */,
- /* 16x16 -> 8x8 */
- { 149, 53, 53 } /* a/l both not split */,
- { 94, 20, 48 } /* a split, l not split */,
- { 83, 53, 24 } /* l split, a not split */,
- { 52, 18, 18 } /* a/l both split */,
- /* 32x32 -> 16x16 */
- { 150, 40, 39 } /* a/l both not split */,
- { 78, 12, 26 } /* a split, l not split */,
- { 67, 33, 11 } /* l split, a not split */,
- { 24, 7, 5 } /* a/l both split */,
- /* 64x64 -> 32x32 */
- { 174, 35, 49 } /* a/l both not split */,
- { 68, 11, 27 } /* a split, l not split */,
- { 57, 15, 9 } /* l split, a not split */,
- { 12, 3, 3 } /* a/l both split */
- }, { /* frame_type = interframe */
- /* 8x8 -> 4x4 */
- { 199, 122, 141 } /* a/l both not split */,
- { 147, 63, 159 } /* a split, l not split */,
- { 148, 133, 118 } /* l split, a not split */,
- { 121, 104, 114 } /* a/l both split */,
- /* 16x16 -> 8x8 */
- { 174, 73, 87 } /* a/l both not split */,
- { 92, 41, 83 } /* a split, l not split */,
- { 82, 99, 50 } /* l split, a not split */,
- { 53, 39, 39 } /* a/l both split */,
- /* 32x32 -> 16x16 */
- { 177, 58, 59 } /* a/l both not split */,
- { 68, 26, 63 } /* a split, l not split */,
- { 52, 79, 25 } /* l split, a not split */,
- { 17, 14, 12 } /* a/l both split */,
- /* 64x64 -> 32x32 */
- { 222, 34, 30 } /* a/l both not split */,
- { 72, 16, 44 } /* a split, l not split */,
- { 58, 32, 12 } /* l split, a not split */,
- { 10, 7, 6 } /* a/l both split */
- }
+static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+ { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc
+ { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v
+ { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h
+ { 97, 5, 44, 131, 176, 139, 48, 68, 97 }, // y = d45
+ { 83, 5, 42, 156, 111, 152, 26, 49, 152 }, // y = d135
+ { 80, 5, 58, 178, 74, 83, 33, 62, 145 }, // y = d117
+ { 86, 5, 32, 154, 192, 168, 14, 22, 163 }, // y = d153
+ { 85, 5, 32, 156, 216, 148, 19, 29, 73 }, // y = d207
+ { 77, 7, 64, 116, 132, 122, 37, 126, 120 }, // y = d63
+ { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
};
-const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
- [INTRA_MODES]
- [INTRA_MODES - 1] = {
- { /* above = dc */
- { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
- { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
- { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */,
- { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */,
- { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
- { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
- { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
- { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */,
- { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
- { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
- }, { /* above = v */
- { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */,
- { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */,
- { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */,
- { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */,
- { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
- { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
- { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
- { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */,
- { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
- { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
- }, { /* above = h */
- { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */,
- { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */,
- { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */,
- { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */,
- { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
- { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
- { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
- { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */,
- { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
- { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
- }, { /* above = d45 */
- { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */,
- { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */,
- { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */,
- { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */,
- { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
- { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
- { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
- { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */,
- { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
- { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
- }, { /* above = d135 */
- { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */,
- { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */,
- { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */,
- { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */,
- { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
- { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
- { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
- { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */,
- { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
- { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
- }, { /* above = d117 */
- { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */,
- { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */,
- { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */,
- { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */,
- { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
- { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
- { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
- { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */,
- { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
- { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
- }, { /* above = d153 */
- { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */,
- { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */,
- { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */,
- { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */,
- { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
- { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
- { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
- { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */,
- { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
- { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
- }, { /* above = d207 */
- { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
- { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
- { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
- { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */,
- { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
- { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
- { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
- { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */,
- { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
- { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
- }, { /* above = d63 */
- { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */,
- { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */,
- { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */,
- { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */,
- { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
- { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
- { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
- { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */,
- { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
- { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
- }, { /* above = tm */
- { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */,
- { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */,
- { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */,
- { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */,
- { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
- { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
- { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
- { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */,
- { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
- { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
- }
+const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 158, 97, 94 }, // a/l both not split
+ { 93, 24, 99 }, // a split, l not split
+ { 85, 119, 44 }, // l split, a not split
+ { 62, 59, 67 }, // a/l both split
+ // 16x16 -> 8x8
+ { 149, 53, 53 }, // a/l both not split
+ { 94, 20, 48 }, // a split, l not split
+ { 83, 53, 24 }, // l split, a not split
+ { 52, 18, 18 }, // a/l both split
+ // 32x32 -> 16x16
+ { 150, 40, 39 }, // a/l both not split
+ { 78, 12, 26 }, // a split, l not split
+ { 67, 33, 11 }, // l split, a not split
+ { 24, 7, 5 }, // a/l both split
+ // 64x64 -> 32x32
+ { 174, 35, 49 }, // a/l both not split
+ { 68, 11, 27 }, // a split, l not split
+ { 57, 15, 9 }, // l split, a not split
+ { 12, 3, 3 }, // a/l both split
+};
+
+static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 199, 122, 141 }, // a/l both not split
+ { 147, 63, 159 }, // a split, l not split
+ { 148, 133, 118 }, // l split, a not split
+ { 121, 104, 114 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87 }, // a/l both not split
+ { 92, 41, 83 }, // a split, l not split
+ { 82, 99, 50 }, // l split, a not split
+ { 53, 39, 39 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59 }, // a/l both not split
+ { 68, 26, 63 }, // a split, l not split
+ { 52, 79, 25 }, // l split, a not split
+ { 17, 14, 12 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
};
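// Note: the 16 contexts in each table above are four block-size groups
// (8x8 through 64x64 splits) times the four above/left split combinations,
// matching PARTITION_CONTEXTS == 4 * PARTITION_PLOFFSET. A minimal sketch of
// how such an index could be formed (illustrative only; the names below are
// assumptions, and the real derivation lives in the partition-context
// helpers):
static int partition_context_sketch(int size_group,  // 0..3: 8x8 .. 64x64
                                    int above_split, int left_split) {
  return size_group * PARTITION_PLOFFSET + (left_split << 1) + above_split;
}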
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
@@ -226,7 +221,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
};
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
+const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-DC_PRED, 2, /* 0 = DC_NODE */
-TM_PRED, 4, /* 1 = TM_NODE */
-V_PRED, 6, /* 2 = V_NODE */
@@ -237,22 +232,20 @@ const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
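// Note: throughout this change, explicit "2 * N - 2" array sizes become
// TREE_SIZE(N), presumably ((N) - 1) * 2: a binary tree with N leaves has
// N - 1 internal nodes, each stored as a pair of entries. Non-positive
// entries are leaves holding the negated symbol; positive entries index the
// next pair. A minimal decoding sketch (read_bit() is a hypothetical
// stand-in for the boolean decoder):
static int read_tree_symbol_sketch(const vp9_tree_index *tree,
                                   const vp9_prob *probs) {
  vp9_tree_index i = 0;
  while ((i = tree[i + read_bit(probs[i >> 1])]) > 0)
    continue;
  return -i;  // e.g. -DC_PRED stored above decodes to DC_PRED
}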
+struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-const vp9_tree_index vp9_inter_mode_tree[6] = {
+const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, -NEWMV
};
+struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-const vp9_tree_index vp9_partition_tree[6] = {
+const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-PARTITION_NONE, 2,
-PARTITION_HORZ, 4,
-PARTITION_VERT, -PARTITION_SPLIT
};
-
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
@@ -286,7 +279,7 @@ static const struct tx_probs default_tx_probs = {
{ 66 } }
};
-void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]) {
ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
@@ -299,7 +292,7 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
}
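// Note: this helper (and its 16x16/8x8 siblings below) collapses per-size
// counts into binary branch counts, since the transform size is coded as a
// chain of binary decisions. A worked example for 32x32-capable blocks with
// counts {TX_4X4: 10, TX_8X8: 5, TX_16X16: 3, TX_32X32: 2}:
//   node 0 (4x4 vs larger):  {10, 5 + 3 + 2} = {10, 10}
//   node 1 (8x8 vs larger):  { 5, 3 + 2}     = { 5,  5}
//   node 2 (16x16 vs 32x32): { 3, 2}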
-void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]) {
ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
@@ -307,7 +300,7 @@ void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
}
-void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]) {
ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
@@ -317,8 +310,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
@@ -338,7 +331,8 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
}
-const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
+const vp9_tree_index vp9_switchable_interp_tree
+ [TREE_SIZE(SWITCHABLE_FILTERS)] = {
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
@@ -356,76 +350,58 @@ void vp9_entropy_mode_init() {
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
-static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) {
- return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
- return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
+static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
+ return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
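// Note: merge_probs() is assumed to blend the previous frame's probability
// with one derived from the new counts, weighted by how much data was seen.
// A sketch of the assumed behavior (get_binary_prob() and weighted_prob()
// are the helpers expected in vp9_treecoder.h):
static vp9_prob merge_probs_sketch(vp9_prob pre_prob, const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);  // ~ct[0]/(ct[0]+ct[1])
  const unsigned int count = MIN(ct[0] + ct[1], count_sat);
  const unsigned int factor = max_update_factor * count / count_sat;
  // linear blend: roughly (pre_prob * (256 - factor) + prob * factor) >> 8
  return weighted_prob(pre_prob, prob, factor);
}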
-static void update_mode_probs(int n_modes,
- const vp9_tree_index *tree, unsigned int *cnt,
- vp9_prob *pre_probs, vp9_prob *dst_probs,
- unsigned int tok0_offset) {
-#define MAX_PROBS 32
- vp9_prob probs[MAX_PROBS];
- unsigned int branch_ct[MAX_PROBS][2];
- int t;
-
- assert(n_modes - 1 < MAX_PROBS);
- vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
- for (t = 0; t < n_modes - 1; ++t)
- dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
+static void adapt_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs, const unsigned int *counts,
+ unsigned int offset, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, offset,
+ COUNT_SAT, MAX_UPDATE_FACTOR, probs);
}
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
FRAME_CONTEXT *fc = &cm->fc;
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
- FRAME_COUNTS *counts = &cm->counts;
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
+ fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
+ fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
+ fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
+ fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
- counts->inter_mode[i], pre_fc->inter_mode_probs[i],
- fc->inter_mode_probs[i], NEARESTMV);
+ adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->y_mode[i], pre_fc->y_mode_prob[i],
- fc->y_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ counts->y_mode[i], 0, fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->uv_mode[i], pre_fc->uv_mode_prob[i],
- fc->uv_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], 0, fc->uv_mode_prob[i]);
- for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
- update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
- counts->partition[i],
- pre_fc->partition_prob[INTER_FRAME][i],
- fc->partition_prob[INTER_FRAME][i], 0);
+ for (i = 0; i < PARTITION_CONTEXTS; i++)
+ adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], 0, fc->partition_prob[i]);
if (cm->mcomp_filter_type == SWITCHABLE) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
- update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
- counts->switchable_interp[i],
- pre_fc->switchable_interp_prob[i],
- fc->switchable_interp_prob[i], 0);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i], 0,
+ fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -437,23 +413,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
+ fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
+ fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
+ fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
}
}
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
- fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
+ fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i],
counts->mbskip[i]);
}
diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h
index 4cf4c03..38b4199 100644
--- a/libvpx/vp9/common/vp9_entropymode.h
+++ b/libvpx/vp9/common/vp9_entropymode.h
@@ -14,10 +14,9 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_treecoder.h"
-#define SUBMVREF_COUNT 5
#define TX_SIZE_CONTEXTS 2
-#define MODE_UPDATE_PROB 252
#define SWITCHABLE_FILTERS 3 // number of switchable filters
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
// #define MODE_STATS
@@ -39,19 +38,20 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
-extern const vp9_tree_index vp9_intra_mode_tree[];
-extern const vp9_tree_index vp9_inter_mode_tree[];
+extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1];
+extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+
+extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-// probability models for partition information
-extern const vp9_tree_index vp9_partition_tree[];
+extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
extern const vp9_tree_index vp9_switchable_interp_tree
- [2 * (SWITCHABLE_FILTERS - 1)];
-
+ [TREE_SIZE(SWITCHABLE_FILTERS)];
extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
void vp9_entropy_mode_init();
@@ -62,11 +62,11 @@ void vp9_init_mbmode_probs(struct VP9Common *cm);
void vp9_adapt_mode_probs(struct VP9Common *cm);
-void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
-void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]);
-void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]);
#endif // VP9_COMMON_VP9_ENTROPYMODE_H_
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index 2e973e5..b061cdb 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -18,14 +18,14 @@
/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
#define COMPANDED_MVREF_THRESH 8
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
-MV_JOINT_ZERO, 2,
-MV_JOINT_HNZVZ, 4,
-MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
};
struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_0, 2,
-MV_CLASS_1, 4,
6, 8,
@@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
};
struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
-0, 2,
-1, 4,
-2, -3
@@ -53,8 +53,8 @@ struct vp9_token vp9_mv_fp_encodings[4];
static const nmv_context default_nmv_context = {
{32, 64, 96},
- {
- { /* vert component */
+ { // NOLINT
+ { /* vert component */ // NOLINT
128, /* sign */
{224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */
{216}, /* class0 */
@@ -64,7 +64,7 @@ static const nmv_context default_nmv_context = {
160, /* class0_hp bit */
128, /* hp */
},
- { /* hor component */
+ { /* hor component */ // NOLINT
128, /* sign */
{216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */
{208}, /* class0 */
@@ -149,7 +149,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
- assert (v != 0); /* should not be zero */
+ assert(v != 0); /* should not be zero */
s = v < 0;
comp_counts->sign[s] += incr;
z = (s ? -v : v) - 1; /* magnitude - 1 */
@@ -175,77 +175,63 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
}
}
+void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
+ if (counts != NULL) {
+ const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
+ ++counts->joints[j];
-void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
- const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
- ++counts->joints[j];
-
- if (mv_joint_vertical(j)) {
- inc_mv_component(mv->row, &counts->comps[0], 1, 1);
- }
+ if (mv_joint_vertical(j)) {
+ inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+ }
- if (mv_joint_horizontal(j)) {
- inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+ if (mv_joint_horizontal(j)) {
+ inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+ }
}
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
+ return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
-static unsigned int adapt_probs(unsigned int i,
- vp9_tree tree,
- vp9_prob this_probs[],
- const vp9_prob last_probs[],
- const unsigned int num_events[]) {
-
-
- const unsigned int left = tree[i] <= 0
- ? num_events[-tree[i]]
- : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
-
- const unsigned int right = tree[i + 1] <= 0
- ? num_events[-tree[i + 1]]
- : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
- const unsigned int ct[2] = { left, right };
- this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
- return left + right;
+static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, 0,
+ MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs);
}
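// Note: tree_merge_probs() is assumed to centralize the recursion deleted
// above: it sums leaf counts into per-branch {left, right} counts bottom-up
// and merges each internal node's probability into probs[i >> 1], exactly as
// adapt_prob() does for a single binary node.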
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ nmv_context *fc = &cm->fc.nmvc;
+ const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
+ const nmv_context_counts *counts = &cm->counts.mv;
- nmv_context *ctx = &cm->fc.nmvc;
- nmv_context *pre_ctx = &pre_fc->nmvc;
- nmv_context_counts *cts = &cm->counts.mv;
-
- adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
+ adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
for (i = 0; i < 2; ++i) {
- ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
- adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
- pre_ctx->comps[i].classes, cts->comps[i].classes);
- adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
- pre_ctx->comps[i].class0, cts->comps[i].class0);
+ nmv_component *comp = &fc->comps[i];
+ const nmv_component *pre_comp = &pre_fc->comps[i];
+ const nmv_component_counts *c = &counts->comps[i];
+
+ comp->sign = adapt_prob(pre_comp->sign, c->sign);
+ adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
- cts->comps[i].bits[j]);
+ comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
- pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
+ comp->class0_fp[j]);
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
- cts->comps[i].fp);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
- cts->comps[i].class0_hp);
- ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
+ comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = adapt_prob(pre_comp->hp, c->hp);
}
}
}
diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h
index a10c933..d843f5b 100644
--- a/libvpx/vp9/common/vp9_entropymv.h
+++ b/libvpx/vp9/common/vp9_entropymv.h
@@ -13,7 +13,7 @@
#define VP9_COMMON_VP9_ENTROPYMV_H_
#include "vp9/common/vp9_treecoder.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_blockd.h"
struct VP9Common;
@@ -43,9 +43,6 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
}
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
/* Symbols for coding magnitude class of nonzero components */
#define MV_CLASSES 11
typedef enum {
@@ -62,9 +59,6 @@ typedef enum {
MV_CLASS_10 = 10, /* (1024,2048] integer pel */
} MV_CLASS_TYPE;
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
#define CLASS0_SIZE (1 << CLASS0_BITS)
#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
@@ -73,10 +67,20 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
#define MV_MAX ((1 << MV_MAX_BITS) - 1)
#define MV_VALS ((MV_MAX << 1) + 1)
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+#define MV_IN_USE_BITS 14
+#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
+#define MV_LOW (-(1 << MV_IN_USE_BITS))
+
+extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
+
+extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
+
+extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)];
extern struct vp9_token vp9_mv_fp_encodings[4];
typedef struct {
@@ -108,7 +112,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
typedef struct {
- unsigned int mvcount[MV_VALS];
unsigned int sign[2];
unsigned int classes[MV_CLASSES];
unsigned int class0[CLASS0_SIZE];
diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h
index 1bf0742..1651b90 100644
--- a/libvpx/vp9/common/vp9_enums.h
+++ b/libvpx/vp9/common/vp9_enums.h
@@ -50,7 +50,7 @@ typedef enum PARTITION_TYPE {
} PARTITION_TYPE;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
-#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
typedef enum {
TX_4X4 = 0, // 4x4 dct transform
@@ -76,4 +76,15 @@ typedef enum {
ADST_ADST = 3 // ADST in both directions
} TX_TYPE;
+typedef enum {
+ UNKNOWN = 0,
+ BT_601 = 1, // YUV
+ BT_709 = 2, // YUV
+ SMPTE_170 = 3, // YUV
+ SMPTE_240 = 4, // YUV
+ RESERVED_1 = 5,
+ RESERVED_2 = 6,
+ SRGB = 7 // RGB
+} COLOR_SPACE;
+
#endif // VP9_COMMON_VP9_ENUMS_H_
diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c
index 4ac2bc9..79ace14 100644
--- a/libvpx/vp9/common/vp9_filter.c
+++ b/libvpx/vp9/common/vp9_filter.c
@@ -8,12 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_filter.h"
-DECLARE_ALIGNED(256, const int16_t,
- vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 },
{ 0, 0, 0, 120, 8, 0, 0, 0 },
{ 0, 0, 0, 112, 16, 0, 0, 0 },
@@ -33,8 +35,8 @@ DECLARE_ALIGNED(256, const int16_t,
};
// Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const int16_t,
- vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{ 0, 1, -5, 126, 8, -3, 1, 0},
{ -1, 3, -10, 122, 18, -6, 2, 0},
@@ -54,8 +56,8 @@ DECLARE_ALIGNED(256, const int16_t,
};
// DCT based filter
-DECLARE_ALIGNED(256, const int16_t,
- vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
{0, 0, 0, 128, 0, 0, 0, 0},
{-1, 3, -7, 127, 8, -3, 1, 0},
{-2, 5, -13, 125, 17, -6, 3, -1},
@@ -75,8 +77,8 @@ DECLARE_ALIGNED(256, const int16_t,
};
// freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const int16_t,
- vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{-3, -1, 32, 64, 38, 1, -3, 0},
{-2, -2, 29, 63, 41, 2, -3, 0},
@@ -94,3 +96,16 @@ DECLARE_ALIGNED(256, const int16_t,
{ 0, -3, 2, 41, 63, 29, -2, -2},
{ 0, -3, 1, 38, 64, 32, -1, -3}
};
+
+
+static const subpel_kernel* vp9_filter_kernels[4] = {
+ vp9_sub_pel_filters_8,
+ vp9_sub_pel_filters_8lp,
+ vp9_sub_pel_filters_8s,
+ vp9_bilinear_filters
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
+ return vp9_filter_kernels[type];
+}
+
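// Note: the table order above is assumed to match the INTERPOLATION_TYPE
// enum added in vp9_filter.h (EIGHTTAP -> vp9_sub_pel_filters_8,
// EIGHTTAP_SMOOTH -> the low-pass set, EIGHTTAP_SHARP -> the DCT-based set,
// BILINEAR last). A usage sketch, with subpel_x a hypothetical sub-pel
// offset:
//   const subpel_kernel *kernel = vp9_get_filter_kernel(EIGHTTAP_SHARP);
//   const int16_t *taps = kernel[subpel_x & SUBPEL_MASK];  // one 8-tap row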
diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h
index 7b1ffae..b1e7e64 100644
--- a/libvpx/vp9/common/vp9_filter.h
+++ b/libvpx/vp9/common/vp9_filter.h
@@ -11,19 +11,37 @@
#ifndef VP9_COMMON_VP9_FILTER_H_
#define VP9_COMMON_VP9_FILTER_H_
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#define FILTER_BITS 7
+
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8
-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS];
+typedef enum {
+ EIGHTTAP = 0,
+ EIGHTTAP_SMOOTH = 1,
+ EIGHTTAP_SHARP = 2,
+ BILINEAR = 3,
+ SWITCHABLE = 4 /* should be the last one */
+} INTERPOLATION_TYPE;
+
+typedef int16_t subpel_kernel[SUBPEL_TAPS];
+
+struct subpix_fn_table {
+ const subpel_kernel *filter_x;
+ const subpel_kernel *filter_y;
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
+
+extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
// filter kernel as a 2 tap filter.
diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c
index 49a731f..b91c501 100644
--- a/libvpx/vp9/common/vp9_findnearmv.c
+++ b/libvpx/vp9/common/vp9_findnearmv.c
@@ -22,14 +22,12 @@ static void lower_mv_precision(MV *mv, int allow_hp) {
}
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
- int_mv *mvlist,
- int_mv *nearest,
- int_mv *near) {
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
+ int_mv *mvlist, int_mv *nearest, int_mv *near) {
int i;
// Make sure all the candidates are properly clamped, etc.
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
- lower_mv_precision(&mvlist[i].as_mv, xd->allow_high_precision_mv);
+ lower_mv_precision(&mvlist[i].as_mv, allow_hp);
clamp_mv2(&mvlist[i].as_mv, xd);
}
*nearest = mvlist[0];
@@ -37,27 +35,28 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col) {
int_mv dst_list[MAX_MV_REF_CANDIDATES];
int_mv mv_list[MAX_MV_REF_CANDIDATES];
- MODE_INFO *const mi = xd->this_mi;
+ MODE_INFO *const mi = xd->mi_8x8[0];
assert(ref_idx == 0 || ref_idx == 1);
assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier
- vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi,
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi,
mi->mbmi.ref_frame[ref_idx],
mv_list, block_idx, mi_row, mi_col);
dst_list[1].as_int = 0;
if (block_idx == 0) {
- memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
+ vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
} else if (block_idx == 1 || block_idx == 2) {
int dst = 0, n;
- union b_mode_info *bmi = mi->bmi;
+ b_mode_info *bmi = mi->bmi;
dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
@@ -66,7 +65,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
dst_list[dst++].as_int = mv_list[n].as_int;
} else {
int dst = 0, n;
- union b_mode_info *bmi = mi->bmi;
+ b_mode_info *bmi = mi->bmi;
assert(block_idx == 3);
dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int;
diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h
index ad0d882..2362caa 100644
--- a/libvpx/vp9/common/vp9_findnearmv.h
+++ b/libvpx/vp9/common/vp9_findnearmv.h
@@ -23,10 +23,8 @@
// check a list of motion vectors by sad score, using a number of rows of
// pixels above and a number of cols of pixels to the left, to select the one
// with the best score to use as the ref motion vector
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
- int_mv *mvlist,
- int_mv *nearest,
- int_mv *near);
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
+ int_mv *mvlist, int_mv *nearest, int_mv *near);
// TODO(jingning): this mv clamping function should be block size dependent.
static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
@@ -36,57 +34,39 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
-void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
- MACROBLOCKD *xd,
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col);
-static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb,
- const MODE_INFO *left_mb, int b) {
- // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
- // understand this condition. This will go away soon.
- const MODE_INFO *mi = cur_mb;
-
+static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *left_mi, int b) {
if (b == 0 || b == 2) {
- /* On L edge, get from MB to left of us */
- mi = left_mb;
- if (!mi)
+ if (!left_mi || is_inter_block(&left_mi->mbmi))
return DC_PRED;
- if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
- return DC_PRED;
- } else if (mi->mbmi.sb_type < BLOCK_8X8) {
- return ((mi->bmi + 1 + b)->as_mode);
- } else {
- return mi->mbmi.mode;
- }
+ return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
+ : left_mi->mbmi.mode;
+ } else {
+ assert(b == 1 || b == 3);
+ return cur_mi->bmi[b - 1].as_mode;
}
- assert(b == 1 || b == 3);
- return (mi->bmi + b - 1)->as_mode;
}
-static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
- const MODE_INFO *above_mb, int b) {
- const MODE_INFO *mi = cur_mb;
-
- if (!(b >> 1)) {
- /* On top edge, get from MB above us */
- mi = above_mb;
- if (!mi)
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b) {
+ if (b == 0 || b == 1) {
+ if (!above_mi || is_inter_block(&above_mi->mbmi))
return DC_PRED;
- if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
- return DC_PRED;
- } else if (mi->mbmi.sb_type < BLOCK_8X8) {
- return ((mi->bmi + 2 + b)->as_mode);
- } else {
- return mi->mbmi.mode;
- }
+ return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
+ : above_mi->mbmi.mode;
+ } else {
+ assert(b == 2 || b == 3);
+ return cur_mi->bmi[b - 2].as_mode;
}
-
- return (mi->bmi + b - 2)->as_mode;
}
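// Note: the rewritten helpers rely on the raster layout of the four 4x4
// sub-blocks inside an 8x8 block:
//   +----+----+
//   | b0 | b1 |
//   +----+----+
//   | b2 | b3 |
//   +----+----+
// b == 0 or 2 lie on the left edge, so left_block_mode() reads the right
// column of left_mi (bmi[b + 1]); b == 0 or 1 lie on the top edge, so
// above_block_mode() reads the bottom row of above_mi (bmi[b + 2]).
// Interior sub-blocks read their neighbour within cur_mi.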
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c
index a224525..ea8683e 100644
--- a/libvpx/vp9/common/vp9_idct.c
+++ b/libvpx/vp9/common/vp9_idct.c
@@ -18,20 +18,20 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
int16_t output[16];
int a1, b1, c1, d1, e1;
- int16_t *ip = input;
+ const int16_t *ip = input;
int16_t *op = output;
for (i = 0; i < 4; i++) {
- a1 = ip[0] >> WHT_UPSCALE_FACTOR;
- c1 = ip[1] >> WHT_UPSCALE_FACTOR;
- d1 = ip[2] >> WHT_UPSCALE_FACTOR;
- b1 = ip[3] >> WHT_UPSCALE_FACTOR;
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
a1 += c1;
d1 -= b1;
e1 = (a1 - d1) >> 1;
@@ -60,24 +60,24 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
- dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
- dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
- dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
+ dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+ dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+ dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+ dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
ip++;
dest++;
}
}
-void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
int i;
int a1, e1;
int16_t tmp[4];
- int16_t *ip = in;
+ const int16_t *ip = in;
int16_t *op = tmp;
- a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
a1 -= e1;
op[0] = a1;
@@ -96,7 +96,7 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
}
}
-void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
+static void idct4_1d(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
// stage 1
@@ -116,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
output[3] = step[0] - step[3];
}
-void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[4 * 4];
int16_t *outptr = out;
int i, j;
@@ -124,7 +124,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
// Rows
for (i = 0; i < 4; ++i) {
- vp9_idct4_1d(input, outptr);
+ idct4_1d(input, outptr);
input += 4;
outptr += 4;
}
@@ -133,14 +133,14 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
- vp9_idct4_1d(temp_in, temp_out);
+ idct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
int i;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -156,7 +156,7 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
}
}
-static void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(const int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
// stage 1
@@ -174,7 +174,7 @@ static void idct8_1d(int16_t *input, int16_t *output) {
step1[6] = dct_const_round_shift(temp2);
// stage 2 & stage 3 - even half
- vp9_idct4_1d(step1, step1);
+ idct4_1d(step1, step1);
// stage 2 - odd half
step2[4] = step1[4] + step1[5];
@@ -201,7 +201,7 @@ static void idct8_1d(int16_t *input, int16_t *output) {
output[7] = step1[0] - step1[7];
}
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
int i, j;
@@ -220,12 +220,12 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -234,11 +234,11 @@ void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
-static void iadst4_1d(int16_t *input, int16_t *output) {
+static void iadst4_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[0];
@@ -280,13 +280,13 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(s3);
}
-void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
const transform_2d IHT_4[] = {
- { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0
- { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1
- { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2
- { iadst4_1d, iadst4_1d } // ADST_ADST = 3
+ { idct4_1d, idct4_1d }, // DCT_DCT = 0
+ { iadst4_1d, idct4_1d }, // ADST_DCT = 1
+ { idct4_1d, iadst4_1d }, // DCT_ADST = 2
+ { iadst4_1d, iadst4_1d } // ADST_ADST = 3
};
int i, j;
@@ -307,11 +307,11 @@ void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-static void iadst8_1d(int16_t *input, int16_t *output) {
+static void iadst8_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -395,8 +395,8 @@ static const transform_2d IHT_8[] = {
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
-void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
@@ -416,12 +416,12 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
+ }
}
-void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8] = { 0 };
int16_t *outptr = out;
int i, j;
@@ -441,12 +441,12 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-static void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(const int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
@@ -611,7 +611,7 @@ static void idct16_1d(int16_t *input, int16_t *output) {
output[15] = step2[0] - step2[15];
}
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
int i, j;
@@ -630,12 +630,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void iadst16_1d(int16_t *input, int16_t *output) {
+static void iadst16_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -813,8 +813,8 @@ static const transform_2d IHT_16[] = {
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
-void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
@@ -834,12 +834,11 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]); }
}
-void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16] = { 0 };
int16_t *outptr = out;
int i, j;
@@ -859,13 +858,12 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -874,11 +872,11 @@ void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
-static void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(const int16_t *input, int16_t *output) {
int16_t step1[32], step2[32];
int temp1, temp2;
@@ -1245,7 +1243,7 @@ static void idct32_1d(int16_t *input, int16_t *output) {
output[31] = step1[0] - step1[31];
}
-void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
int i, j;
@@ -1253,6 +1251,44 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
// Rows
for (i = 0; i < 32; ++i) {
+ int16_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ idct32_1d(input, outptr);
+ else
+ vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ idct32_1d(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
+ }
+}
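// Note: the zero_coeff passes above are a binary OR-reduction of each row's
// 32 coefficients (16, 8, 4, then 2 partial ORs), so zero_coeff[0] |
// zero_coeff[1] is non-zero iff the row has any non-zero coefficient;
// all-zero rows skip the 1-D transform and are memset instead. An
// equivalent, more direct sketch:
//   int any_nonzero = 0;
//   for (j = 0; j < 32; ++j)
//     any_nonzero |= input[j];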
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
+ int16_t out[32 * 32] = {0};
+ int16_t *outptr = out;
+ int i, j;
+ int16_t temp_in[32], temp_out[32];
+
+ // Rows
+ // only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
idct32_1d(input, outptr);
input += 32;
outptr += 32;
@@ -1264,13 +1300,116 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ int a1;
+
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
- output[0] = ROUND_POWER_OF_TWO(out, 6);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = clip_pixel(dest[i] + a1);
+ dest += stride;
+ }
+}
+
+// idct
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+ if (eob > 1)
+ vp9_idct4x4_16_add(input, dest, stride);
+ else
+ vp9_idct4x4_1_add(input, dest, stride);
+}
+
+
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+ if (eob > 1)
+ vp9_iwht4x4_16_add(input, dest, stride);
+ else
+ vp9_iwht4x4_1_add(input, dest, stride);
+}
+
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+ // If dc is 1, then input[0] is the reconstructed value and does not need
+ // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
+
+ // The calculation can be simplified if there are not many non-zero dct
+ // coefficients. Use eobs to decide what to do.
+ // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+ // Combine that with code here.
+ if (eob) {
+ if (eob == 1)
+ // DC only DCT coefficient
+ vp9_idct8x8_1_add(input, dest, stride);
+ else if (eob <= 10)
+ vp9_idct8x8_10_add(input, dest, stride);
+ else
+ vp9_idct8x8_64_add(input, dest, stride);
+ }
+}
+
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob) {
+ /* The calculation can be simplified if there are not many non-zero dct
+ * coefficients. Use eobs to separate different cases. */
+ if (eob) {
+ if (eob == 1)
+ /* DC only DCT coefficient. */
+ vp9_idct16x16_1_add(input, dest, stride);
+ else if (eob <= 10)
+ vp9_idct16x16_10_add(input, dest, stride);
+ else
+ vp9_idct16x16_256_add(input, dest, stride);
+ }
+}
+
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob) {
+ if (eob) {
+ if (eob == 1)
+ vp9_idct32x32_1_add(input, dest, stride);
+ else if (eob <= 34)
+ // non-zero coeff only in upper-left 8x8
+ vp9_idct32x32_34_add(input, dest, stride);
+ else
+ vp9_idct32x32_1024_add(input, dest, stride);
+ }
+}
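// Note: these wrappers dispatch on the end-of-block index from the
// coefficient decode: eob == 1 is a DC-only block, and the eob <= 10 /
// eob <= 34 cutoffs presumably bound how far into the scan order non-zero
// coefficients can reach (per the comment above, the first 34 entries of
// the 32x32 scan fall inside the upper-left 8x8). A hypothetical call site,
// with dqcoeff/dst/dst_stride/eob assumed to come from the decoder:
//   vp9_idct32x32_add(dqcoeff, dst, dst_stride, eob);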
+
+// iht
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
+ if (tx_type == DCT_DCT)
+ vp9_idct4x4_add(input, dest, stride, eob);
+ else
+ vp9_iht4x4_16_add(input, dest, stride, tx_type);
+}
+
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
+ if (tx_type == DCT_DCT) {
+ vp9_idct8x8_add(input, dest, stride, eob);
+ } else {
+ if (eob > 0) {
+ vp9_iht8x8_64_add(input, dest, stride, tx_type);
+ }
+ }
+}
+
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
+ if (tx_type == DCT_DCT) {
+ vp9_idct16x16_add(input, dest, stride, eob);
+ } else {
+ if (eob > 0) {
+ vp9_iht16x16_256_add(input, dest, stride, tx_type);
+ }
+ }
}
diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h
index 0c47da6..2b3f35f 100644
--- a/libvpx/vp9/common/vp9_idct.h
+++ b/libvpx/vp9/common/vp9_idct.h
@@ -16,16 +16,18 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
// Constants and Macros used by all idct/dct functions
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
-#define WHT_UPSCALE_FACTOR 2
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
#define pair_set_epi16(a, b) \
- _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
+ _mm_set_epi16(b, a, b, a, b, a, b, a)
#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)
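// Note: both pair_set_epi16 forms pack {a, b} pairs across the register
// (e.g. pair_set_epi16(1, 2) yields 16-bit lanes {1, 2, 1, 2, ...}), but the
// old expression could hit signed-overflow undefined behavior when b has its
// top bit set, since (uint16_t)(b) promotes to int before the << 16.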
@@ -79,10 +81,27 @@ static INLINE int dct_const_round_shift(int input) {
return rv;
}
-typedef void (*transform_1d)(int16_t*, int16_t*);
+typedef void (*transform_1d)(const int16_t*, int16_t*);
typedef struct {
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
+
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+
+
#endif // VP9_COMMON_VP9_IDCT_H_
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index cfb5cd4..218e12e 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_reconinter.h"
@@ -16,12 +16,6 @@
#include "vp9/common/vp9_seg_common.h"
-struct loop_filter_info {
- const uint8_t *mblim;
- const uint8_t *lim;
- const uint8_t *hev_thr;
-};
-
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
@@ -259,8 +253,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
+ vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
@@ -268,7 +262,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
void vp9_loop_filter_init(VP9_COMMON *cm) {
loop_filter_info_n *lfi = &cm->lf_info;
struct loopfilter *lf = &cm->lf;
- int i;
+ int lvl;
// init limits for given sharpness
update_sharpness(lfi, lf->sharpness_level);
@@ -278,8 +272,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
lf_init_lut(lfi);
// init hev threshold const vectors
- for (i = 0; i < 4; i++)
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -316,13 +310,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
continue;
}
- intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift);
+ intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift);
lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift)
- + (lf->mode_deltas[mode] << n_shift);
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift)
+ + lf->mode_deltas[mode] * (1 << n_shift);
lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
}
}
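// Note: ref_deltas[] and mode_deltas[] are signed adjustments, and
// left-shifting a negative value is undefined behavior in C; multiplying by
// (1 << n_shift), as above, performs the same scaling safely, e.g.
//   -2 * (1 << 1) == -4, whereas -2 << 1 is formally undefined.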
@@ -330,16 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
static int build_lfi(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi,
- struct loop_filter_info *lfi) {
+ const loop_filter_thresh **lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
if (filter_level > 0) {
- lfi->mblim = lfi_n->mblim[filter_level];
- lfi->lim = lfi_n->lim[filter_level];
- lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
+ *lfi = &lfi_n->lfthr[filter_level];
return 1;
} else {
return 0;
@@ -351,11 +343,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= 1) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
if (mask & 1) {
if (mask_16x16 & 1) {
vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -379,7 +373,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
s += 8;
- lfi++;
+ p_lfi++;
mask_16x16 >>= 1;
mask_8x8 >>= 1;
mask_4x4 >>= 1;
@@ -393,12 +387,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
int only_4x4_1,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
count = 1;
if (mask & 1) {
if (!only_4x4_1) {
@@ -432,7 +428,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
lfi->lim, lfi->hev_thr, 1);
}
s += 8 * count;
- lfi += count;
+ p_lfi += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
@@ -805,7 +801,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -834,7 +830,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
+ if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
continue;
// Build masks based on the transform size of each block
@@ -925,7 +921,7 @@ static void filter_block_plane(VP9_COMMON *const cm,
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
int row_shift = 3 - ss_x;
int row_mask = 0xff >> (ss_x << 2);
@@ -938,8 +934,8 @@ static void filter_block_plane(VP9_COMMON *const cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = mi_8x8[c];
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
- continue;
+
+ build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
}
if (!plane->plane_type) {
mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index 91d40ac..62389ea 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_VP9_LOOPFILTER_H_
#include "vpx_ports/mem.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_seg_common.h"
@@ -46,12 +46,13 @@ struct loopfilter {
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
typedef struct {
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- hev_thr[4][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
uint8_t mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
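// Note: the thresholds move from three parallel per-level arrays to one
// array of loop_filter_thresh structs, so a single filter level selects all
// three SIMD-aligned vectors at once (see build_lfi() above). Usage sketch,
// with lf_info an assumed, populated loop_filter_info_n:
//   const loop_filter_thresh *t = &lf_info.lfthr[filter_level];
//   // t->mblim, t->lim, t->hev_thr replace three separate lookups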
diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c
index 88130d8..2c4bf6c 100644
--- a/libvpx/vp9/common/vp9_loopfilter_filters.c
+++ b/libvpx/vp9/common/vp9_loopfilter_filters.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c
index bfeeb57..8df8aec 100644
--- a/libvpx/vp9/common/vp9_mvref_common.c
+++ b/libvpx/vp9/common/vp9_mvref_common.c
@@ -108,7 +108,7 @@ static const int idx_n_column_to_subblock[4][2] = {
};
// clamp_mv_ref
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
@@ -119,10 +119,9 @@ static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
// This function returns either the appropriate sub block or block's mv
// depending on whether the block_size < 8x8 and check_sub_blocks is set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
- int check_sub_blocks, int which_mv,
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
int search_col, int block_idx) {
- return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8
+ return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv];
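
The removed check_sub_blocks flag is folded into block_idx: a negative index is now the sentinel for a whole-block search, which is what the vp9_find_mv_refs wrapper further down passes (-1). A hedged toy model of the convention:

    typedef struct { int row, col; } toy_mv;

    typedef struct {
      toy_mv block_mv;   /* block-level motion vector */
      toy_mv sub_mv[4];  /* per-subblock vectors for sub-8x8 modes */
      int is_sub8x8;     /* stands in for sb_type < BLOCK_8X8 */
    } toy_candidate;

    /* block_idx >= 0: a sub-8x8 candidate may contribute a subblock mv;
     * block_idx < 0 (the old check_sub_blocks == 0): always the block mv. */
    static toy_mv toy_get_sub_block_mv(const toy_candidate *c, int block_idx) {
      return (block_idx >= 0 && c->is_sub8x8) ? c->sub_mv[block_idx & 3]
                                              : c->block_mv;
    }
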
@@ -171,17 +170,19 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
-static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row,
+static INLINE int is_inside(const TileInfo *const tile,
+ int mi_col, int mi_row, int mi_rows,
const MV *mv) {
return !(mi_row + mv->row < 0 ||
- mi_col + mv->col < cm->cur_tile_mi_col_start ||
- mi_row + mv->row >= cm->mi_rows ||
- mi_col + mv->col >= cm->cur_tile_mi_col_end);
+ mi_col + mv->col < tile->mi_col_start ||
+ mi_row + mv->row >= mi_rows ||
+ mi_col + mv->col >= tile->mi_col_end);
}
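
Note that only the column range is tile-relative in this check; the row bound is still the frame-wide mi_rows, now passed explicitly so the predicate no longer needs the whole VP9_COMMON. A self-contained sketch:

    typedef struct { int mi_col_start, mi_col_end; } toy_tile;
    typedef struct { int row, col; } toy_mv;

    /* For a tile covering mi columns [8, 16), a candidate one column to the
     * left of mi_col == 8 is rejected even though it is inside the frame. */
    static int toy_is_inside(const toy_tile *tile, int mi_col, int mi_row,
                             int mi_rows, const toy_mv *mv) {
      return !(mi_row + mv->row < 0 ||
               mi_col + mv->col < tile->mi_col_start ||
               mi_row + mv->row >= mi_rows ||
               mi_col + mv->col >= tile->mi_col_end);
    }
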
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
@@ -202,8 +203,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// and we also need to keep a mode count.
for (i = 0; i < 2; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
- const int check_sub_blocks = block_idx >= 0;
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row
* xd->mode_info_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
@@ -212,13 +212,13 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// Check if the candidate comes from the same reference frame.
if (candidate->ref_frame[0] == ref_frame) {
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0,
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0,
mv_ref->col, block_idx));
different_ref_found = candidate->ref_frame[1] != ref_frame;
} else {
if (candidate->ref_frame[1] == ref_frame)
// Add second motion vector if it has the same ref_frame.
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1,
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1,
mv_ref->col, block_idx));
different_ref_found = 1;
}
@@ -230,7 +230,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// mode counts.
for (; i < MVREF_NEIGHBOURS; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
@@ -260,7 +260,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
if (different_ref_found) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
const MV *mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h
index 39ebdb0..ce4c559 100644
--- a/libvpx/vp9/common/vp9_mvref_common.h
+++ b/libvpx/vp9/common/vp9_mvref_common.h
@@ -15,6 +15,7 @@
#define VP9_COMMON_VP9_MVREF_COMMON_H_
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
@@ -22,11 +23,12 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
int mi_row, int mi_col);
static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
- vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame,
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame,
mv_ref_list, -1, mi_row, mi_col);
}
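
Every caller now has to thread a TileInfo through. A hedged sketch of a call site; vp9_tile_init and its signature are assumptions based on the vp9_tile_common.h include added further down:

    /* Hypothetical call site: the tile worker owns its TileInfo instead of
     * reading cur_tile_mi_col_* from the shared VP9_COMMON. */
    TileInfo tile;
    vp9_tile_init(&tile, cm, tile_row, tile_col);  /* assumed helper */
    vp9_find_mv_refs(cm, xd, &tile, mi, prev_mi, ref_frame,
                     mv_ref_list, mi_row, mi_col);
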
diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h
index f424e6a..452dd6b 100644
--- a/libvpx/vp9/common/vp9_onyx.h
+++ b/libvpx/vp9/common/vp9_onyx.h
@@ -13,7 +13,7 @@
#ifdef __cplusplus
extern "C"
-{
+{ // NOLINT
#endif
#include "./vpx_config.h"
@@ -33,7 +33,6 @@ extern "C"
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
-
} VPX_SCALING;
typedef enum {
@@ -71,42 +70,48 @@ extern "C"
// 3 - lowest quality/fastest decode
int width; // width of data passed to the compressor
int height; // height of data passed to the compressor
- double framerate; // set to passed in framerate
- int64_t target_bandwidth; // bandwidth to be used in kilobits per second
+ double framerate; // set to passed in framerate
+ int64_t target_bandwidth; // bandwidth to be used in kilobits per second
- int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0
- int Sharpness; // parameter used for sharpening output: recommendation 0:
+ int noise_sensitivity; // pre-processing blur: recommendation 0
+ int Sharpness; // sharpening output: recommendation 0
int cpu_used;
unsigned int rc_max_intra_bitrate_pct;
// mode ->
- // (0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
- // a television signal or feed from a live camera). ( speed setting controls how fast )
- // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
- // encode the output. ( speed setting controls how fast )
- // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
- // speed. The output is compressed at the highest possible quality. This option takes the longest
- // amount of time to encode. ( speed setting ignored )
- // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
- // pass. ( speed setting controls how fast )
- // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
- // pass to create the compressed output. ( speed setting controls how fast )
- // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first
- // encoding pass to create the compressed output using the highest possible quality, and taking a
+ // (0)=Realtime/Live Encoding. This mode is optimized for realtime
+ // encoding (for example, capturing a television signal or feed from
+ // a live camera). ( speed setting controls how fast )
+ // (1)=Good Quality Fast Encoding. The encoder balances quality with the
+ // amount of time it takes to encode the output. ( speed setting
+ // controls how fast )
+ // (2)=One Pass - Best Quality. The encoder places priority on the
+ // quality of the output over encoding speed. The output is compressed
+ // at the highest possible quality. This option takes the longest
+ // amount of time to encode. ( speed setting ignored )
+ // (3)=Two Pass - First Pass. The encoder generates a file of statistics
+ // for use in the second encoding pass. ( speed setting controls how
+ // fast )
+ // (4)=Two Pass - Second Pass. The encoder uses the statistics that were
+ // generated in the first encoding pass to create the compressed
+ // output. ( speed setting controls how fast )
+ // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that
+ // were generated in the first encoding pass to create the compressed
+ // output using the highest possible quality, and taking a
+ // longer amount of time to encode. ( speed setting ignored )
- int Mode; //
+ int Mode;
// Key Framing Operations
- int auto_key; // automatically detect cut scenes and set the keyframes
- int key_freq; // maximum distance to key frame.
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
- int allow_lag; // allow lagged compression (if 0 lagin frames is ignored)
- int lag_in_frames; // how many frames lag before we start encoding
+ int allow_lag; // allow lagged compression (if 0 lag_in_frames is ignored)
+ int lag_in_frames; // how many frames lag before we start encoding
// ----------------------------------------------------------------
// DATARATE CONTROL OPTIONS
- int end_usage; // vbr or cbr
+ int end_usage; // vbr or cbr
// buffer targeting aggressiveness
int under_shoot_pct;
@@ -138,7 +143,7 @@ extern "C"
int play_alternate;
int alt_freq;
- int encode_breakout; // early breakout encode threshold : for video conf recommend 800
+ int encode_breakout; // early breakout : for video conf recommend 800
/* Bitfield defining the error resiliency features to enable.
* Can provide decodable frames after losses in previous
@@ -173,8 +178,8 @@ extern "C"
void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
-// receive a frames worth of data caller can assume that a copy of this frame is made
-// and not just a copy of the pointer..
+ // receive a frame's worth of data. caller can assume that a copy of
+ // this frame is made and not just a copy of the pointer.
int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
int64_t end_time_stamp);
@@ -216,8 +221,6 @@ extern "C"
int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
unsigned int height);
- int vp9_switch_layer(VP9_PTR comp, int layer);
-
void vp9_set_svc(VP9_PTR comp, int use_svc);
int vp9_get_quantizer(VP9_PTR c);
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index 0431e14..a2af57a 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -11,14 +11,15 @@
#ifndef VP9_COMMON_VP9_ONYXC_INT_H_
#define VP9_COMMON_VP9_ONYXC_INT_H_
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
#include "vp9/common/vp9_postproc.h"
@@ -40,10 +41,9 @@
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
- [PARTITION_TYPES - 1];
+ vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
- vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+ vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
@@ -58,11 +58,11 @@ typedef struct frame_contexts {
typedef struct {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
- unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
- unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
@@ -91,6 +91,8 @@ typedef struct VP9Common {
DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]);
#endif
+ COLOR_SPACE color_space;
+
int width;
int height;
int display_width;
@@ -116,11 +118,12 @@ typedef struct VP9Common {
// Each frame can reference ALLOWED_REFS_PER_FRAME buffers
int active_ref_idx[ALLOWED_REFS_PER_FRAME];
struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
+ struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME];
int new_fb_idx;
YV12_BUFFER_CONFIG post_proc_buffer;
- FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE last_frame_type; /* last frame's frame type for motion search. */
FRAME_TYPE frame_type;
int show_frame;
@@ -129,6 +132,8 @@ typedef struct VP9Common {
// Flag signaling that the frame is encoded using only INTRA modes.
int intra_only;
+ int allow_high_precision_mv;
+
// Flag signaling that the frame context should be reset to default values.
// 0 or 1 implies don't reset, 2 reset just the context specified in the
// frame header, 3 reset all contexts.
@@ -146,8 +151,6 @@ typedef struct VP9Common {
TX_MODE tx_mode;
int base_qindex;
- int last_kf_gf_q; /* Q used on the last GF or KF */
-
int y_dc_delta_q;
int uv_dc_delta_q;
int uv_ac_delta_q;
@@ -172,7 +175,7 @@ typedef struct VP9Common {
// Persistent mb segment id map used in prediction.
unsigned char *last_frame_seg_map;
- INTERPOLATIONFILTERTYPE mcomp_filter_type;
+ INTERPOLATION_TYPE mcomp_filter_type;
loop_filter_info_n lf_info;
@@ -183,14 +186,6 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
- /* Y,U,V */
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- // partition contexts
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
-
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
@@ -213,10 +208,19 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
- int cur_tile_mi_col_start, cur_tile_mi_col_end;
- int cur_tile_mi_row_start, cur_tile_mi_row_end;
} VP9_COMMON;
+// ref == 0 => LAST_FRAME
+// ref == 1 => GOLDEN_FRAME
+// ref == 2 => ALTREF_FRAME
+static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) {
+ return &cm->yv12_fb[cm->active_ref_idx[ref]];
+}
+
+static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
+ return &cm->yv12_fb[cm->new_fb_idx];
+}
+
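
These helpers simply name the index indirection that callers previously spelled out by hand. A self-contained model of the pattern:

    #define TOY_NUM_BUFS 12  /* stands in for NUM_YV12_BUFFERS */

    typedef struct { int id; } toy_buf;

    typedef struct {
      toy_buf yv12_fb[TOY_NUM_BUFS];
      int active_ref_idx[3];  /* 0 LAST, 1 GOLDEN, 2 ALTREF, as noted above */
      int new_fb_idx;
    } toy_common;

    static toy_buf *toy_ref_buffer(toy_common *cm, int ref) {
      return &cm->yv12_fb[cm->active_ref_idx[ref]];
    }

    static toy_buf *toy_new_buffer(toy_common *cm) {
      return &cm->yv12_fb[cm->new_fb_idx];
    }
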
static int get_free_fb(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
@@ -241,58 +245,38 @@ static int mi_cols_aligned_to_sb(int n_mis) {
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
-static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
+static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) {
+ return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
+ : cm->fc.partition_prob[ctx];
+}
+
+static INLINE void set_skip_context(
+ MACROBLOCKD *xd,
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE],
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16],
+ int mi_row, int mi_col) {
const int above_idx = mi_col * 2;
const int left_idx = (mi_row * 2) & 15;
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
- pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x);
- pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y);
+ pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x);
+ pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y);
}
}
-static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
- xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
-}
-
-// return the node index in the prob tree for binary coding
-static int check_bsize_coverage(int bs, int mi_rows, int mi_cols,
- int mi_row, int mi_col) {
- const int r = (mi_row + bs < mi_rows);
- const int c = (mi_col + bs < mi_cols);
-
- if (r && c)
- return 0;
-
- if (c && !r)
- return 1; // only allow horizontal/split partition types
-
- if (r && !c)
- return 2; // only allow vertical/split partition types
-
- return -1;
-}
-
-static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int bh,
- int mi_col, int bw) {
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) << 3);
- xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) << 3;
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) << 3);
- xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) << 3;
+static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh,
+ int mi_col, int bw,
+ int mi_rows, int mi_cols) {
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
// Are edges available for intra prediction?
xd->up_available = (mi_row != 0);
- xd->left_available = (mi_col > cm->cur_tile_mi_col_start);
- xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
-}
-
-static int get_token_alloc(int mb_rows, int mb_cols) {
- return mb_rows * mb_cols * (48 * 16 + 4);
+ xd->left_available = (mi_col > tile->mi_col_start);
}
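
Besides the TileInfo plumbing, the edge distances switch from << 3 to * 8. The motivation shows up again in clamp_mv_to_umv_border_sb below: left-shifting a negative int (such as mb_to_left_edge) is undefined behavior in C, while the equivalent multiplication is well defined, so the patch uses the multiply form consistently. A small standalone illustration (MI_SIZE == 8 is an assumption):

    #include <stdio.h>

    int main(void) {
      const int MI_SIZE = 8;  /* assumed: one mode-info unit is 8 pixels */
      int mi_row = 3, mi_rows = 8, bh = 2;
      /* (x * MI_SIZE) * 8 matches (x * MI_SIZE) << 3 in magnitude but stays
       * defined even when the shifted operand would have been negative. */
      int mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
      int mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
      printf("top=%d bottom=%d\n", mb_to_top_edge, mb_to_bottom_edge);
      return 0;
    }
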
static void set_prev_mi(VP9_COMMON *cm) {
@@ -306,4 +290,62 @@ static void set_prev_mi(VP9_COMMON *cm) {
cm->prev_mi = use_prev_in_find_mv_refs ?
cm->prev_mip + cm->mode_info_stride + 1 : NULL;
}
+
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+ return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE void update_partition_context(
+ PARTITION_CONTEXT *above_seg_context,
+ PARTITION_CONTEXT left_seg_context[8],
+ int mi_row, int mi_col,
+ BLOCK_SIZE sb_type,
+ BLOCK_SIZE sb_size) {
+ PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
+ PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
+
+ const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+ const int bwl = b_width_log2(sb_type);
+ const int bhl = b_height_log2(sb_type);
+ const int boffset = b_width_log2(BLOCK_64X64) - bsl;
+ const char pcval0 = ~(0xe << boffset);
+ const char pcval1 = ~(0xf << boffset);
+ const char pcvalue[2] = {pcval0, pcval1};
+
+ assert(MAX(bwl, bhl) <= bsl);
+
+ // Update the partition context at the end nodes: set the partition
+ // bits of block sizes larger than the current one to one, and the
+ // partition bits of smaller block sizes to zero.
+ vpx_memset(above_ctx, pcvalue[bwl == bsl], bs);
+ vpx_memset(left_ctx, pcvalue[bhl == bsl], bs);
+}
+
+static INLINE int partition_plane_context(
+ const PARTITION_CONTEXT *above_seg_context,
+ const PARTITION_CONTEXT left_seg_context[8],
+ int mi_row, int mi_col,
+ BLOCK_SIZE sb_type) {
+ const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
+
+ int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+ int above = 0, left = 0, i;
+ int boffset = mi_width_log2(BLOCK_64X64) - bsl;
+
+ assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
+ assert(bsl >= 0);
+ assert(boffset >= 0);
+
+ for (i = 0; i < bs; i++)
+ above |= (above_ctx[i] & (1 << boffset));
+ for (i = 0; i < bs; i++)
+ left |= (left_ctx[i] & (1 << boffset));
+
+ above = (above > 0);
+ left = (left > 0);
+
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
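
A worked example of the bit test in partition_plane_context: each context entry stores one bit per block-size level, and a block ORs together the bit at its own level across the entries it spans. A hedged toy with the constants inlined (assumes mi_width_log2(BLOCK_64X64) == 3 and PARTITION_PLOFFSET == 4):

    static int toy_partition_ctx(const unsigned char *above_ctx,
                                 const unsigned char *left_ctx, int bsl) {
      const int bs = 1 << bsl;      /* block width in mode-info units */
      const int boffset = 3 - bsl;  /* bit index for this size level */
      int above = 0, left = 0, i;
      for (i = 0; i < bs; i++) above |= above_ctx[i] & (1 << boffset);
      for (i = 0; i < bs; i++) left |= left_ctx[i] & (1 << boffset);
      /* four contexts per size level: (left, above) in {0,1} x {0,1} */
      return ((left > 0) * 2 + (above > 0)) + bsl * 4;
    }

For a 32x32 block, bsl == 2, so four entries are tested on each side at bit offset 1.
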
#endif // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index 955e676..212a28a 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -8,6 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
@@ -18,11 +21,6 @@
#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
#define RGB_TO_YUV(t) \
( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
(0.098*(float)(t & 0xff)) + 16), \
@@ -155,7 +153,6 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
-
int kernel = 4;
int v = p_src[col];
@@ -257,7 +254,7 @@ void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
- const short *rv3 = &vp9_rv[63 & rand()];
+ const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT
for (c = 0; c < cols; c++) {
uint8_t *s = &dst[c];
@@ -408,7 +405,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
next = next + j;
}
-
}
for (; next < 256; next++)
@@ -416,7 +412,7 @@ static void fillrd(struct postproc_state *state, int q, int a) {
}
for (i = 0; i < 3072; i++) {
- state->noise[i] = char_dist[rand() & 0xff];
+ state->noise[i] = char_dist[rand() & 0xff]; // NOLINT
}
for (i = 0; i < 16; i++) {
@@ -680,13 +676,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
#if 0 && CONFIG_POSTPROC_VISUALIZER
if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
char message[512];
- sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
- (cm->frame_type == KEY_FRAME),
- cm->refresh_golden_frame,
- cm->base_qindex,
- cm->filter_level,
- flags,
- cm->mb_cols, cm->mb_rows);
+ snprintf(message, sizeof(message) - 1,
+ "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+ (cm->frame_type == KEY_FRAME),
+ cm->refresh_golden_frame,
+ cm->base_qindex,
+ cm->filter_level,
+ flags,
+ cm->mb_cols, cm->mb_rows);
vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
cm->post_proc_buffer.y_stride);
}
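
The sprintf calls in this file all become bounded snprintf calls. Since snprintf already NUL-terminates within its size argument, the sizeof(message) - 1 bound is one byte more conservative than strictly required; a minimal standalone check:

    #include <stdio.h>

    int main(void) {
      char message[512];
      /* snprintf writes at most size - 1 characters plus the terminating
       * NUL, so passing sizeof(message) - 1 leaves one spare byte. */
      int n = snprintf(message, sizeof(message) - 1,
                       "F%1dG%1dQ%3d", 1, 0, 128);
      printf("n=%d message=%s\n", n, message);
      return 0;
    }
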
@@ -707,7 +704,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
for (j = 0; j < mb_cols; j++) {
char zz[4];
- sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+ snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a');
vp9_blit_text(zz, y_ptr, post->y_stride);
mb_index++;
@@ -716,7 +713,6 @@ int vp9_post_proc_frame(struct VP9Common *cm,
mb_index++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
-
}
}
@@ -740,9 +736,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
mi[mb_index].mbmi.skip_coeff);
if (cm->frame_type == KEY_FRAME)
- sprintf(zz, "a");
+ snprintf(zz, sizeof(zz) - 1, "a");
else
- sprintf(zz, "%c", dc_diff + '0');
+ snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0');
vp9_blit_text(zz, y_ptr, post->y_stride);
mb_index++;
@@ -751,7 +747,6 @@ int vp9_post_proc_frame(struct VP9Common *cm,
mb_index++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
-
}
}
@@ -894,8 +889,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
- } else
+ } else {
vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
+ }
}
mi++;
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index 81fbf1f..6018e17 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -16,25 +16,33 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
+static INLINE const MB_MODE_INFO *get_above_mbmi(const MODE_INFO *const above) {
+ return (above != NULL) ? &above->mbmi : NULL;
+}
+
+static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) {
+ return (left != NULL) ? &left->mbmi : NULL;
+}
+
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialized to 0.
// left
- const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
+ const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi)
: 0;
const int left_interp = left_in_image && left_mv_pred
? left_mi->mbmi.interp_filter
: SWITCHABLE_FILTERS;
// above
- const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
+ const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi)
: 0;
const int above_interp = above_in_image && above_mv_pred
? above_mi->mbmi.interp_filter
@@ -53,14 +61,14 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
}
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -81,12 +89,12 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -126,14 +134,14 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-cm->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -206,14 +214,14 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
}
unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -272,14 +280,14 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
@@ -361,12 +369,12 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// left of the entries corresponding to real blocks.
// The prediction flags in these dummy entries are initialized to 0.
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
int above_context = max_tx_size;
int left_context = max_tx_size;
@@ -389,19 +397,14 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
}
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
- xd->this_mi->mbmi.seg_id_predicted = pred_flag;
-}
-
-void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- uint8_t pred_flag) {
- xd->this_mi->mbmi.skip_coeff = pred_flag;
+ xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag;
}
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y, segment_id = INT_MAX;
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 47ca8ab..19032bf 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -14,17 +14,25 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_onyxc_int.h"
+static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
+ return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL;
+}
+
+static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) {
+ return xd->left_available ? xd->mi_8x8[-1] : NULL;
+}
+
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col);
-
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0;
- const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int above_sip = (above_mi != NULL) ?
+ above_mi->mbmi.seg_id_predicted : 0;
+ const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
- return above_sip + (xd->left_available ? left_sip : 0);
+ return above_sip + left_sip;
}
static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
@@ -35,12 +43,13 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0;
- const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int above_skip_coeff = (above_mi != NULL) ?
+ above_mi->mbmi.skip_coeff : 0;
+ const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0;
- return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0);
+ return above_skip_coeff + left_skip_coeff;
}
static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
@@ -49,12 +58,9 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
}
static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
- return xd->this_mi->mbmi.skip_coeff;
+ return xd->mi_8x8[0]->mbmi.skip_coeff;
}
-void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- uint8_t pred_flag);
-
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
@@ -69,8 +75,9 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
const MACROBLOCKD *xd);
-static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
+static INLINE
+vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd);
return cm->fc.comp_inter_prob[pred_context];
}
@@ -120,14 +127,14 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
return get_tx_probs(bsize, context, tx_probs);
}
-static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
- TX_SIZE tx_size, struct tx_counts *tx_counts) {
- if (bsize >= BLOCK_32X32)
- tx_counts->p32x32[context][tx_size]++;
- else if (bsize >= BLOCK_16X16)
- tx_counts->p16x16[context][tx_size]++;
+static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+ struct tx_counts *tx_counts) {
+ if (bsize < BLOCK_16X16)
+ return tx_counts->p8x8[context];
+ else if (bsize < BLOCK_32X32)
+ return tx_counts->p16x16[context];
else
- tx_counts->p8x8[context][tx_size]++;
+ return tx_counts->p32x32[context];
}
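
Replacing the updater with a getter that returns the count row lets call sites both bump and read the counters through one lookup; the old update_tx_counts(bsize, context, tx_size, tx_counts) becomes an increment at the caller. A hedged sketch with stand-in dimensions and size thresholds:

    struct toy_tx_counts {
      unsigned int p32x32[2][4];  /* dimensions are stand-ins */
      unsigned int p16x16[2][3];
      unsigned int p8x8[2][2];
    };

    static unsigned int *toy_get_tx_counts(int bsize, int ctx,
                                           struct toy_tx_counts *c) {
      if (bsize < 2)       /* stands in for bsize < BLOCK_16X16 */
        return c->p8x8[ctx];
      else if (bsize < 3)  /* stands in for bsize < BLOCK_32X32 */
        return c->p16x16[ctx];
      else
        return c->p32x32[ctx];
    }

    /* New call pattern:  ++toy_get_tx_counts(bsize, ctx, &counts)[tx_size]; */
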
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
index bc40854..6dbdb42 100644
--- a/libvpx/vp9/common/vp9_quant_common.c
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -14,69 +14,69 @@
#if 1
static const int16_t dc_qlookup[QINDEX_RANGE] = {
- 4, 8, 8, 9, 10, 11, 12, 12,
- 13, 14, 15, 16, 17, 18, 19, 19,
- 20, 21, 22, 23, 24, 25, 26, 26,
- 27, 28, 29, 30, 31, 32, 32, 33,
- 34, 35, 36, 37, 38, 38, 39, 40,
- 41, 42, 43, 43, 44, 45, 46, 47,
- 48, 48, 49, 50, 51, 52, 53, 53,
- 54, 55, 56, 57, 57, 58, 59, 60,
- 61, 62, 62, 63, 64, 65, 66, 66,
- 67, 68, 69, 70, 70, 71, 72, 73,
- 74, 74, 75, 76, 77, 78, 78, 79,
- 80, 81, 81, 82, 83, 84, 85, 85,
- 87, 88, 90, 92, 93, 95, 96, 98,
- 99, 101, 102, 104, 105, 107, 108, 110,
- 111, 113, 114, 116, 117, 118, 120, 121,
- 123, 125, 127, 129, 131, 134, 136, 138,
- 140, 142, 144, 146, 148, 150, 152, 154,
- 156, 158, 161, 164, 166, 169, 172, 174,
- 177, 180, 182, 185, 187, 190, 192, 195,
- 199, 202, 205, 208, 211, 214, 217, 220,
- 223, 226, 230, 233, 237, 240, 243, 247,
- 250, 253, 257, 261, 265, 269, 272, 276,
- 280, 284, 288, 292, 296, 300, 304, 309,
- 313, 317, 322, 326, 330, 335, 340, 344,
- 349, 354, 359, 364, 369, 374, 379, 384,
- 389, 395, 400, 406, 411, 417, 423, 429,
- 435, 441, 447, 454, 461, 467, 475, 482,
- 489, 497, 505, 513, 522, 530, 539, 549,
- 559, 569, 579, 590, 602, 614, 626, 640,
- 654, 668, 684, 700, 717, 736, 755, 775,
- 796, 819, 843, 869, 896, 925, 955, 988,
+ 4, 8, 8, 9, 10, 11, 12, 12,
+ 13, 14, 15, 16, 17, 18, 19, 19,
+ 20, 21, 22, 23, 24, 25, 26, 26,
+ 27, 28, 29, 30, 31, 32, 32, 33,
+ 34, 35, 36, 37, 38, 38, 39, 40,
+ 41, 42, 43, 43, 44, 45, 46, 47,
+ 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60,
+ 61, 62, 62, 63, 64, 65, 66, 66,
+ 67, 68, 69, 70, 70, 71, 72, 73,
+ 74, 74, 75, 76, 77, 78, 78, 79,
+ 80, 81, 81, 82, 83, 84, 85, 85,
+ 87, 88, 90, 92, 93, 95, 96, 98,
+ 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 131, 134, 136, 138,
+ 140, 142, 144, 146, 148, 150, 152, 154,
+ 156, 158, 161, 164, 166, 169, 172, 174,
+ 177, 180, 182, 185, 187, 190, 192, 195,
+ 199, 202, 205, 208, 211, 214, 217, 220,
+ 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276,
+ 280, 284, 288, 292, 296, 300, 304, 309,
+ 313, 317, 322, 326, 330, 335, 340, 344,
+ 349, 354, 359, 364, 369, 374, 379, 384,
+ 389, 395, 400, 406, 411, 417, 423, 429,
+ 435, 441, 447, 454, 461, 467, 475, 482,
+ 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640,
+ 654, 668, 684, 700, 717, 736, 755, 775,
+ 796, 819, 843, 869, 896, 925, 955, 988,
1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
};
static const int16_t ac_qlookup[QINDEX_RANGE] = {
- 4, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22,
- 23, 24, 25, 26, 27, 28, 29, 30,
- 31, 32, 33, 34, 35, 36, 37, 38,
- 39, 40, 41, 42, 43, 44, 45, 46,
- 47, 48, 49, 50, 51, 52, 53, 54,
- 55, 56, 57, 58, 59, 60, 61, 62,
- 63, 64, 65, 66, 67, 68, 69, 70,
- 71, 72, 73, 74, 75, 76, 77, 78,
- 79, 80, 81, 82, 83, 84, 85, 86,
- 87, 88, 89, 90, 91, 92, 93, 94,
- 95, 96, 97, 98, 99, 100, 101, 102,
- 104, 106, 108, 110, 112, 114, 116, 118,
- 120, 122, 124, 126, 128, 130, 132, 134,
- 136, 138, 140, 142, 144, 146, 148, 150,
- 152, 155, 158, 161, 164, 167, 170, 173,
- 176, 179, 182, 185, 188, 191, 194, 197,
- 200, 203, 207, 211, 215, 219, 223, 227,
- 231, 235, 239, 243, 247, 251, 255, 260,
- 265, 270, 275, 280, 285, 290, 295, 300,
- 305, 311, 317, 323, 329, 335, 341, 347,
- 353, 359, 366, 373, 380, 387, 394, 401,
- 408, 416, 424, 432, 440, 448, 456, 465,
- 474, 483, 492, 501, 510, 520, 530, 540,
- 550, 560, 571, 582, 593, 604, 615, 627,
- 639, 651, 663, 676, 689, 702, 715, 729,
- 743, 757, 771, 786, 801, 816, 832, 848,
- 864, 881, 898, 915, 933, 951, 969, 988,
+ 4, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86,
+ 87, 88, 89, 90, 91, 92, 93, 94,
+ 95, 96, 97, 98, 99, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 155, 158, 161, 164, 167, 170, 173,
+ 176, 179, 182, 185, 188, 191, 194, 197,
+ 200, 203, 207, 211, 215, 219, 223, 227,
+ 231, 235, 239, 243, 247, 251, 255, 260,
+ 265, 270, 275, 280, 285, 290, 295, 300,
+ 305, 311, 317, 323, 329, 335, 341, 347,
+ 353, 359, 366, 373, 380, 387, 394, 401,
+ 408, 416, 424, 432, 440, 448, 456, 465,
+ 474, 483, 492, 501, 510, 520, 530, 540,
+ 550, 560, 571, 582, 593, 604, 615, 627,
+ 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848,
+ 864, 881, 898, 915, 933, 951, 969, 988,
1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index dc1d46c..1c96788 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -20,37 +20,44 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
-
void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE mcomp_filter_type,
+ INTERPOLATION_TYPE mcomp_filter_type,
VP9_COMMON *cm) {
- if (xd->mi_8x8 && xd->this_mi) {
- MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
+ if (xd->mi_8x8 && xd->mi_8x8[0]) {
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
- set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
- cm->active_ref_scale);
+ set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
+ mbmi->ref_frame[1] - LAST_FRAME,
+ cm->active_ref_scale);
} else {
set_scale_factors(xd, -1, -1, cm->active_ref_scale);
}
- switch (mcomp_filter_type) {
- case EIGHTTAP:
- case SWITCHABLE:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
- break;
- case EIGHTTAP_SMOOTH:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
- break;
- case EIGHTTAP_SHARP:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
- break;
- case BILINEAR:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
- break;
- }
+ xd->subpix.filter_x = xd->subpix.filter_y =
+ vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
+ EIGHTTAP : mcomp_filter_type);
+
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
+static void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const MV32 *mv,
+ const struct scale_factors *scale,
+ int w, int h, int ref,
+ const struct subpix_fn_table *subpix,
+ int xs, int ys) {
+ const int subpel_x = mv->col & SUBPEL_MASK;
+ const int subpel_y = mv->row & SUBPEL_MASK;
+
+ src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS);
+ scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride,
+ subpix->filter_x[subpel_x], xs,
+ subpix->filter_y[subpel_y], ys,
+ w, h);
+}
+
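
inter_predictor factors out the step every prediction path shares: split the (possibly scaled) motion vector into an integer pixel offset and a subpel phase, then let the phase select the interpolation kernel. A standalone illustration of the split (SUBPEL_BITS == 4, i.e. 1/16-pel units, is an assumption):

    #include <stdio.h>

    #define SUBPEL_BITS 4  /* assumed: q4 motion vectors, 1/16-pel units */
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

    int main(void) {
      int mv_col = 37;                      /* 2 full pels + 5/16 pel */
      int int_pel = mv_col >> SUBPEL_BITS;  /* 2: advances the src pointer */
      int subpel = mv_col & SUBPEL_MASK;    /* 5: picks filter_x[subpel] */
      printf("int=%d subpel=%d\n", int_pel, subpel);
      return 0;
    }
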
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *src_mv,
@@ -59,18 +66,13 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
const struct subpix_fn_table *subpix,
enum mv_precision precision) {
const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1,
- is_q4 ? src_mv->col : src_mv->col << 1 };
- const MV32 mv = scale->scale_mv(&mv_q4, scale);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
-
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
- scale->predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride,
- subpix->filter_x[subpel_x], scale->x_step_q4,
- subpix->filter_y[subpel_y], scale->y_step_q4,
- w, h);
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ const struct scale_factors_common *sfc = scale->sfc;
+ const MV32 mv = sfc->scale_mv(&mv_q4, scale);
+
+ inter_predictor(src, src_stride, dst, dst_stride, &mv, scale,
+ w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4);
}
static INLINE int round_mv_comp_q4(int value) {
@@ -100,16 +102,17 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
const int spel_bottom = spel_top - SUBPEL_SHIFTS;
MV clamped_mv = {
- src_mv->row << (1 - ss_y),
- src_mv->col << (1 - ss_x)
+ src_mv->row * (1 << (1 - ss_y)),
+ src_mv->col * (1 << (1 - ss_x))
};
assert(ss_x <= 1);
assert(ss_y <= 1);
- clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
- (xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
- (xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
- (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+ clamp_mv(&clamped_mv,
+ xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+ xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+ xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+ xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
return clamped_mv;
}
@@ -130,8 +133,8 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
const int bh = plane_block_height(bsize, pd);
const int x = 4 * (block & ((1 << bwl) - 1));
const int y = 4 * (block >> bwl);
- const MODE_INFO *mi = xd->this_mi;
- const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
+ const MODE_INFO *mi = xd->mi_8x8[0];
+ const int is_compound = has_second_ref(&mi->mbmi);
int ref;
assert(x < bw);
@@ -139,14 +142,10 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
- for (ref = 0; ref < 1 + use_second_ref; ++ref) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
struct scale_factors *const scale = &xd->scale_factor[ref];
struct buf_2d *const pre_buf = &pd->pre[ref];
struct buf_2d *const dst_buf = &pd->dst;
-
- const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
- pre_buf->stride, scale);
-
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
// TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
@@ -162,15 +161,32 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
- const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
-
- scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
- vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- &res_mv, scale,
- 4 << pred_w, 4 << pred_h, ref,
- &xd->subpix, MV_PRECISION_Q4);
+ // The mv precision here is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys;
+
+ if (vp9_is_scaled(scale->sfc)) {
+ pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
+ scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+ scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
+ xs = scale->sfc->x_step_q4;
+ ys = scale->sfc->y_step_q4;
+ } else {
+ pre = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ &scaled_mv, scale,
+ 4 << pred_w, 4 << pred_h, ref,
+ &xd->subpix, xs, ys);
}
}
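
The new branch is the point of this hunk: for an unscaled reference the offset and MV scaling calls are skipped entirely and the step sizes are hard-wired to 16, which in q4 units means one source pixel per output pixel. A quick standalone check of that convention:

    #include <stdio.h>

    int main(void) {
      /* Step sizes are expressed in q4 (1/16-pel) units. */
      int x_step_q4 = 16;                     /* the unscaled fast path */
      int w = 8;                              /* output width in pixels */
      int src_pixels = (w * x_step_q4) >> 4;  /* 8: a 1:1 mapping */
      printf("source pixels consumed: %d\n", src_pixels);
      return 0;
    }

A step of 24 would instead consume 12 source pixels for the same 8 outputs, i.e. a 3:2 downscale.
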
@@ -184,36 +200,17 @@ typedef void (*foreach_predicted_block_visitor)(int plane, int block,
static INLINE void foreach_predicted_block_in_plane(
const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
foreach_predicted_block_visitor visit, void *arg) {
- int i, x, y;
-
- // block sizes in number of 4x4 blocks log 2 ("*_b")
- // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
- // subsampled size of the block
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
- // size of the predictor to use.
- int pred_w, pred_h;
-
- if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
+ if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
assert(bsize == BLOCK_8X8);
- pred_w = 0;
- pred_h = 0;
+ for (y = 0; y < 1 << bhl; ++y)
+ for (x = 0; x < 1 << bwl; ++x)
+ visit(plane, i++, bsize, 0, 0, arg);
} else {
- pred_w = bwl;
- pred_h = bhl;
- }
- assert(pred_w <= bwl);
- assert(pred_h <= bhl);
-
- // visit each subblock in raster order
- i = 0;
- for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
- for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
- visit(plane, i, bsize, pred_w, pred_h, arg);
- i += 1 << pred_w;
- }
- i += (1 << (bwl + pred_h)) - (1 << bwl);
+ visit(plane, 0, bsize, bwl, bhl, arg);
}
}
@@ -249,15 +246,17 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
struct scale_factors *const sf = &cm->active_ref_scale[i];
+ struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i];
if (ref >= NUM_YV12_BUFFERS) {
vp9_zero(*sf);
+ vp9_zero(*sfc);
} else {
YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
- vp9_setup_scale_factors_for_frame(sf,
+ vp9_setup_scale_factors_for_frame(sf, sfc,
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
- if (vp9_is_scaled(sf))
+ if (vp9_is_scaled(sfc))
vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 504b793..2c8a6e4 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -25,7 +25,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE filter,
+ INTERPOLATION_TYPE filter,
VP9_COMMON *cm);
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -38,8 +38,10 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *scale) {
- const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
- const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset;
+ const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) :
+ x_offset;
+ const int y = scale ? scale->sfc->scale_value_y(y_offset, scale->sfc) :
+ y_offset;
return y * stride + x;
}
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index 4a451b9..eb643b0 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -13,7 +13,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_once.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride,
}
}
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride) {
diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h
index e9d0dbf..6e3f55c 100644
--- a/libvpx/vp9/common/vp9_reconintra.h
+++ b/libvpx/vp9/common/vp9_reconintra.h
@@ -14,8 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
- TX_SIZE tx_size, int mode,
- const uint8_t *ref, int ref_stride,
- uint8_t *dst, int dst_stride);
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+ TX_SIZE tx_size, int mode,
+ const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/libvpx/vp9/common/vp9_rtcd.c b/libvpx/vp9/common/vp9_rtcd.c
index 72613ae..dc15a84 100644
--- a/libvpx/vp9/common/vp9_rtcd.c
+++ b/libvpx/vp9/common/vp9_rtcd.c
@@ -7,9 +7,9 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vpx_scale_rtcd(void);
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh
index 042afbb..debec61 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -22,38 +22,23 @@ forward_decls vp9_common_forward_decls
# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
- sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+ sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
# this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
-
-#
-# Dequant
-#
-
-prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add_16x16
-
-prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add_8x8
-
-prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add
-
-prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob"
-specialize vp9_idct_add_32x32
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 &&
+ ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
#
# RECON
#
prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_4x4
+specialize vp9_d207_predictor_4x4 $ssse3_x86inc
prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_4x4 $ssse3_x86inc
prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_4x4
+specialize vp9_d63_predictor_4x4 $ssse3_x86inc
prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_4x4 $ssse3_x86inc
@@ -65,7 +50,7 @@ prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const u
specialize vp9_d135_predictor_4x4
prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_4x4
+specialize vp9_d153_predictor_4x4 $ssse3_x86inc
prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_4x4 $sse_x86inc
@@ -86,13 +71,13 @@ prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_dc_128_predictor_4x4
prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_8x8
+specialize vp9_d207_predictor_8x8 $ssse3_x86inc
prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_8x8 $ssse3_x86inc
prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_8x8
+specialize vp9_d63_predictor_8x8 $ssse3_x86inc
prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_8x8 $ssse3_x86inc
@@ -104,7 +89,7 @@ prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const u
specialize vp9_d135_predictor_8x8
prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_8x8
+specialize vp9_d153_predictor_8x8 $ssse3_x86inc
prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_8x8 $sse_x86inc
@@ -125,13 +110,13 @@ prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_dc_128_predictor_8x8
prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_16x16
+specialize vp9_d207_predictor_16x16 $ssse3_x86inc
prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_16x16 $ssse3_x86inc
prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_16x16
+specialize vp9_d63_predictor_16x16 $ssse3_x86inc
prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_16x16 $ssse3_x86inc
@@ -143,7 +128,7 @@ prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d135_predictor_16x16
prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_16x16
+specialize vp9_d153_predictor_16x16 $ssse3_x86inc
prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_16x16 $sse2_x86inc
@@ -164,16 +149,16 @@ prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, con
specialize vp9_dc_128_predictor_16x16
prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_32x32
+specialize vp9_d207_predictor_32x32 $ssse3_x86inc
prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_32x32 $ssse3_x86inc
prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_32x32
+specialize vp9_d63_predictor_32x32 $ssse3_x86inc
prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_32x32 $ssse3 x86inc
+specialize vp9_h_predictor_32x32 $ssse3_x86inc
prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_32x32
@@ -202,17 +187,6 @@ specialize vp9_dc_left_predictor_32x32
prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_128_predictor_32x32
-if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_8x8 sse2 neon
-
-prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_16x16 sse2 neon
-
-prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_32x32 sse2 neon
-fi
-
#
# Loopfilter
#
@@ -226,7 +200,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8
specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -268,80 +242,81 @@ specialize vp9_blend_b
# Sub Pixel Filters
#
prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_copy $sse2_x86inc neon
+specialize vp9_convolve_copy $sse2_x86inc neon dspr2
prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_avg $sse2_x86inc neon
+specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3 neon
+specialize vp9_convolve8 sse2 ssse3 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz ssse3 neon
+specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert ssse3 neon
+specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3 neon
+specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz ssse3 neon
+specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert ssse3 neon
+specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
#
# dct
#
-prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add sse2 neon
+prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_1_add sse2 neon dspr2
-prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_add sse2 neon
+prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_16_add sse2 neon dspr2
-prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_1_add sse2 neon
+prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_1_add sse2 neon dspr2
-prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_add sse2 neon
+prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_64_add sse2 neon dspr2
-prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_8x8_add sse2 neon
+prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_10_add sse2 neon dspr2
-prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_1_add sse2 neon
+prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_1_add sse2 neon dspr2
-prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_add sse2 neon
+prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_256_add sse2 neon dspr2
-prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_16x16_add sse2 neon
+prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_10_add sse2 neon dspr2
-prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_add sse2 neon
+prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1024_add sse2 neon dspr2
-prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_32x32
+prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_34_add sse2
-prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2 neon
+prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1_add sse2 dspr2
-prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2 neon
+prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht4x4_16_add sse2 neon dspr2
-prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16_add sse2
+prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht8x8_64_add sse2 neon dspr2
+
+prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_iht16x16_256_add sse2 dspr2
-prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
-specialize vp9_idct4_1d sse2
# dct and add
-prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_1_add
+prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_1_add
-prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_add
+prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_16_add
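
The renames in this hunk encode, in each function's name, the maximum number of nonzero coefficients the variant handles: _1 is DC-only, _10 covers the first ten positions in scan order, and _16/_64/_256/_1024 are the full 4x4/8x8/16x16/32x32 blocks (_34 is the partial 32x32 case). Callers pick a variant from the end-of-block count. A hedged sketch of that dispatch, modeled on the decoder's use (the wrapper name and thresholds here are illustrative):

    #include <stdint.h>

    void vp9_idct8x8_1_add(const int16_t *input, uint8_t *dest, int stride);
    void vp9_idct8x8_10_add(const int16_t *input, uint8_t *dest, int stride);
    void vp9_idct8x8_64_add(const int16_t *input, uint8_t *dest, int stride);

    /* Pick the cheapest inverse transform that still covers all nonzero
     * coefficients (eob = end-of-block index). */
    static void idct8x8_add(const int16_t *input, uint8_t *dest, int stride,
                            int eob) {
      if (eob == 1)        /* DC-only block */
        vp9_idct8x8_1_add(input, dest, stride);
      else if (eob <= 10)  /* only low-frequency coefficients present */
        vp9_idct8x8_10_add(input, dest, stride);
      else                 /* general case: all 64 coefficients */
        vp9_idct8x8_64_add(input, dest, stride);
    }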
#
# Encoder functions below this point.
@@ -697,10 +672,10 @@ specialize vp9_block_error $sse2_x86inc
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block $sse2_x86inc
-prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
-prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
@@ -715,38 +690,32 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
fi
# fdct functions
-prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht4x4 sse2
-prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht8x8 sse2
-prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht16x16 sse2
-prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x8 sse2
-
-prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct4x4 sse2
-
-prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x4 sse2
+prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fwht4x4
-prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct32x32 sse2
+prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct4x4 sse2
-prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct32x32_rd sse2
+prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct8x8 sse2
-prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct16x16 sse2
+prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct16x16 sse2
-prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4
+prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct32x32 sse2
-prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh8x4
+prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct32x32_rd sse2
#
# Motion search
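
For readers unfamiliar with the rtcd_defs.sh DSL: each prototype line declares a C signature, and specialize lists the ISA suffixes for which an optimized implementation exists. Variables such as $ssse3_x86inc expand to the ISA token only when the guarding condition near the top of the file holds, and to nothing otherwise, so a specialization silently drops out on configurations that cannot assemble it. One prototype/specialize pair roughly expands in the generated vp9_rtcd.h to the following (a sketch, not the verbatim generator output):

    /* From: prototype void vp9_fdct8x8 "const int16_t *input, ..."
     *       specialize vp9_fdct8x8 sse2                              */
    void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
    void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
    RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output,
                                    int stride);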
diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c
index 989206c..3f0994f 100644
--- a/libvpx/vp9/common/vp9_scale.c
+++ b/libvpx/vp9/common/vp9_scale.c
@@ -12,23 +12,23 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_scale.h"
-static INLINE int scaled_x(int val, const struct scale_factors *scale) {
- return val * scale->x_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) {
+ return val * sfc->x_scale_fp >> REF_SCALE_SHIFT;
}
-static INLINE int scaled_y(int val, const struct scale_factors *scale) {
- return val * scale->y_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) {
+ return val * sfc->y_scale_fp >> REF_SCALE_SHIFT;
}
-static int unscaled_value(int val, const struct scale_factors *scale) {
- (void) scale;
+static int unscaled_value(int val, const struct scale_factors_common *sfc) {
+ (void) sfc;
return val;
}
static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) {
const MV32 res = {
- scaled_y(mv->row, scale) + scale->y_offset_q4,
- scaled_x(mv->col, scale) + scale->x_offset_q4
+ scaled_y(mv->row, scale->sfc) + scale->y_offset_q4,
+ scaled_x(mv->col, scale->sfc) + scale->x_offset_q4
};
return res;
}
@@ -43,8 +43,8 @@ static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) {
static void set_offsets_with_scaling(struct scale_factors *scale,
int row, int col) {
- scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK;
- scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK;
+ scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
+ scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
}
static void set_offsets_without_scaling(struct scale_factors *scale,
@@ -70,31 +70,30 @@ static int check_scale_factors(int other_w, int other_h,
}
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+ struct scale_factors_common *scale_comm,
int other_w, int other_h,
int this_w, int this_h) {
if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
- scale->x_scale_fp = REF_INVALID_SCALE;
- scale->y_scale_fp = REF_INVALID_SCALE;
+ scale_comm->x_scale_fp = REF_INVALID_SCALE;
+ scale_comm->y_scale_fp = REF_INVALID_SCALE;
return;
}
- scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
- scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
- scale->x_step_q4 = scaled_x(16, scale);
- scale->y_step_q4 = scaled_y(16, scale);
- scale->x_offset_q4 = 0; // calculated per block
- scale->y_offset_q4 = 0; // calculated per block
+ scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+ scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+ scale_comm->x_step_q4 = scaled_x(16, scale_comm);
+ scale_comm->y_step_q4 = scaled_y(16, scale_comm);
- if (vp9_is_scaled(scale)) {
- scale->scale_value_x = scaled_x;
- scale->scale_value_y = scaled_y;
- scale->set_scaled_offsets = set_offsets_with_scaling;
- scale->scale_mv = scaled_mv;
+ if (vp9_is_scaled(scale_comm)) {
+ scale_comm->scale_value_x = scaled_x;
+ scale_comm->scale_value_y = scaled_y;
+ scale_comm->set_scaled_offsets = set_offsets_with_scaling;
+ scale_comm->scale_mv = scaled_mv;
} else {
- scale->scale_value_x = unscaled_value;
- scale->scale_value_y = unscaled_value;
- scale->set_scaled_offsets = set_offsets_without_scaling;
- scale->scale_mv = unscaled_mv;
+ scale_comm->scale_value_x = unscaled_value;
+ scale_comm->scale_value_y = unscaled_value;
+ scale_comm->set_scaled_offsets = set_offsets_without_scaling;
+ scale_comm->scale_mv = unscaled_mv;
}
// TODO(agrange): Investigate the best choice of functions to use here
@@ -103,44 +102,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
// applied in one direction only, and not at all for 0,0, seems to give the
// best quality, but it may be worth trying an additional mode that does
// do the filtering on full-pel.
- if (scale->x_step_q4 == 16) {
- if (scale->y_step_q4 == 16) {
+ if (scale_comm->x_step_q4 == 16) {
+ if (scale_comm->y_step_q4 == 16) {
// No scaling in either direction.
- scale->predict[0][0][0] = vp9_convolve_copy;
- scale->predict[0][0][1] = vp9_convolve_avg;
- scale->predict[0][1][0] = vp9_convolve8_vert;
- scale->predict[0][1][1] = vp9_convolve8_avg_vert;
- scale->predict[1][0][0] = vp9_convolve8_horiz;
- scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][0][0] = vp9_convolve_copy;
+ scale_comm->predict[0][0][1] = vp9_convolve_avg;
+ scale_comm->predict[0][1][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- scale->predict[0][0][0] = vp9_convolve8_vert;
- scale->predict[0][0][1] = vp9_convolve8_avg_vert;
- scale->predict[0][1][0] = vp9_convolve8_vert;
- scale->predict[0][1][1] = vp9_convolve8_avg_vert;
- scale->predict[1][0][0] = vp9_convolve8;
- scale->predict[1][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][0][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[0][1][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[1][0][0] = vp9_convolve8;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg;
}
} else {
- if (scale->y_step_q4 == 16) {
+ if (scale_comm->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- scale->predict[0][0][0] = vp9_convolve8_horiz;
- scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
- scale->predict[0][1][0] = vp9_convolve8;
- scale->predict[0][1][1] = vp9_convolve8_avg;
- scale->predict[1][0][0] = vp9_convolve8_horiz;
- scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][1][0] = vp9_convolve8;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- scale->predict[0][0][0] = vp9_convolve8;
- scale->predict[0][0][1] = vp9_convolve8_avg;
- scale->predict[0][1][0] = vp9_convolve8;
- scale->predict[0][1][1] = vp9_convolve8_avg;
- scale->predict[1][0][0] = vp9_convolve8;
- scale->predict[1][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][0][0] = vp9_convolve8;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][1][0] = vp9_convolve8;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][0][0] = vp9_convolve8;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
- scale->predict[1][1][0] = vp9_convolve8;
- scale->predict[1][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][1][0] = vp9_convolve8;
+ scale_comm->predict[1][1][1] = vp9_convolve8_avg;
+
+ scale->sfc = scale_comm;
+ scale->x_offset_q4 = 0; // calculated per block
+ scale->y_offset_q4 = 0; // calculated per block
}
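
The vp9_scale.c rewrite splits what used to be one struct: the fixed per-reference state (fixed-point factors, step sizes, and the predict function table) now lives once in scale_factors_common, while scale_factors keeps only the cheap per-block offsets plus a pointer to the shared part. A minimal usage sketch, assuming a 960x540 reference feeding a 1920x1080 frame (the local names are illustrative):

    #include "vp9/common/vp9_scale.h"

    /* 2:1 upscaled frame, so MV components roughly halve when mapped
     * back into the smaller reference. */
    static MV32 scale_example(void) {
      struct scale_factors_common sfc;
      struct scale_factors sf;
      const MV mv = { 16, -8 };  /* 1/8-pel units */

      vp9_setup_scale_factors_for_frame(&sf, &sfc, 960, 540, 1920, 1080);
      sf.sfc->set_scaled_offsets(&sf, 0, 0);  /* per-block, cheap */
      return sf.sfc->scale_mv(&mv, &sf);      /* ~{8, -4} plus offsets */
    }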
diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h
index 7a720d0..1437fcd 100644
--- a/libvpx/vp9/common/vp9_scale.h
+++ b/libvpx/vp9/common/vp9_scale.h
@@ -18,34 +18,40 @@
#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
#define REF_INVALID_SCALE -1
-struct scale_factors {
+struct scale_factors;
+struct scale_factors_common {
int x_scale_fp; // horizontal fixed point scale factor
int y_scale_fp; // vertical fixed point scale factor
- int x_offset_q4;
int x_step_q4;
- int y_offset_q4;
int y_step_q4;
- int (*scale_value_x)(int val, const struct scale_factors *scale);
- int (*scale_value_y)(int val, const struct scale_factors *scale);
+ int (*scale_value_x)(int val, const struct scale_factors_common *sfc);
+ int (*scale_value_y)(int val, const struct scale_factors_common *sfc);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
+struct scale_factors {
+ int x_offset_q4;
+ int y_offset_q4;
+ const struct scale_factors_common *sfc;
+};
+
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+ struct scale_factors_common *scale_comm,
int other_w, int other_h,
int this_w, int this_h);
-static int vp9_is_valid_scale(const struct scale_factors *sf) {
- return sf->x_scale_fp != REF_INVALID_SCALE &&
- sf->y_scale_fp != REF_INVALID_SCALE;
+static int vp9_is_valid_scale(const struct scale_factors_common *sfc) {
+ return sfc->x_scale_fp != REF_INVALID_SCALE &&
+ sfc->y_scale_fp != REF_INVALID_SCALE;
}
-static int vp9_is_scaled(const struct scale_factors *sf) {
- return sf->x_scale_fp != REF_NO_SCALE ||
- sf->y_scale_fp != REF_NO_SCALE;
+static int vp9_is_scaled(const struct scale_factors_common *sfc) {
+ return sfc->x_scale_fp != REF_NO_SCALE ||
+ sfc->y_scale_fp != REF_NO_SCALE;
}
-#endif // VP9_COMMON_VP9_SCALE_H_
+#endif // VP9_COMMON_VP9_SCALE_H_
diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c
new file mode 100644
index 0000000..f17da91
--- /dev/null
+++ b/libvpx/vp9/common/vp9_scan.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_scan.h"
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
+ 0, 4, 1, 5,
+ 8, 2, 12, 9,
+ 3, 6, 13, 10,
+ 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
+ 0, 4, 8, 1,
+ 12, 5, 9, 2,
+ 13, 6, 10, 3,
+ 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
+ 0, 1, 4, 2,
+ 5, 3, 6, 8,
+ 9, 7, 12, 10,
+ 13, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
+ 0, 8, 1, 16, 9, 2, 17, 24,
+ 10, 3, 18, 25, 32, 11, 4, 26,
+ 33, 19, 40, 12, 34, 27, 5, 41,
+ 20, 48, 13, 35, 42, 28, 21, 6,
+ 49, 56, 36, 43, 29, 7, 14, 50,
+ 57, 44, 22, 37, 15, 51, 58, 30,
+ 45, 23, 52, 59, 38, 31, 60, 53,
+ 46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
+ 0, 8, 16, 1, 24, 9, 32, 17,
+ 2, 40, 25, 10, 33, 18, 48, 3,
+ 26, 41, 11, 56, 19, 34, 4, 49,
+ 27, 42, 12, 35, 20, 57, 50, 28,
+ 5, 43, 13, 36, 58, 51, 21, 44,
+ 6, 29, 59, 37, 14, 52, 22, 7,
+ 45, 60, 30, 15, 38, 53, 23, 46,
+ 31, 61, 39, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
+ 0, 1, 2, 8, 9, 3, 16, 10,
+ 4, 17, 11, 24, 5, 18, 25, 12,
+ 19, 26, 32, 6, 13, 20, 33, 27,
+ 7, 34, 40, 21, 28, 41, 14, 35,
+ 48, 42, 29, 36, 49, 22, 43, 15,
+ 56, 37, 50, 44, 30, 57, 23, 51,
+ 58, 45, 38, 52, 31, 59, 53, 46,
+ 60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+ 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+ 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+ 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+ 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+ 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+ 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+ 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+ 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+ 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+ 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+ 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+ 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+ 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+ 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+ 251,
+ 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
+ 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+ 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+ 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+ 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+ 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+ 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+ 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+ 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+ 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+ 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+ 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+ 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+ 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+ 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+ 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+ 236,
+ 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
+ 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+ 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+ 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+ 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+ 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+ 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+ 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+ 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+ 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+ 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+ 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+ 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+ 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+ 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+ 158,
+ 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+ 175,
+ 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+ 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+ 68, 131, 37, 100,
+ 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+ 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+ 102, 352, 8, 197,
+ 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+ 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+ 41, 417, 199, 136,
+ 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+ 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+ 295, 420, 106, 451,
+ 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+ 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+ 453, 139, 44, 234,
+ 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+ 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+ 486, 77, 204, 362,
+ 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+ 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+ 111, 238, 48, 143,
+ 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+ 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+ 393, 300, 269, 176, 145,
+ 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+ 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+ 550, 519, 488, 457, 426, 395,
+ 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+ 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+ 210, 179, 117, 86, 55, 738, 707,
+ 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+ 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+ 645, 552, 521, 428, 397, 304,
+ 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+ 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+ 864, 833, 802, 771, 740, 709,
+ 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+ 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+ 710, 679, 617, 586, 555, 493,
+ 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+ 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+ 743, 619, 495, 371, 247, 123,
+ 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+ 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+ 898, 836, 805, 774, 712, 681,
+ 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+ 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+ 651, 620, 589, 558, 527,
+ 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+ 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+ 559, 497, 466, 435, 373,
+ 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+ 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+ 499, 375, 251, 127,
+ 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+ 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+ 685, 654, 592, 561,
+ 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+ 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+ 438, 407, 376, 345,
+ 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+ 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+ 967, 874, 843, 750,
+ 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+ 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+ 564, 533, 440, 409,
+ 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+ 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+ 752, 721, 690, 659,
+ 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+ 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+ 350, 319, 1002, 971,
+ 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+ 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+ 537, 444, 413, 972,
+ 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+ 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+ 570, 539, 508, 477,
+ 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+ 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+ 1007, 883, 759, 635, 511,
+ 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+ 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+ 884, 853, 822, 791,
+ 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+ 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+ 1011, 887, 763, 639,
+ 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+ 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+ 702, 671, 1013, 982,
+ 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+ 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+ 1016, 985, 954, 923,
+ 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+ 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+ 990, 959, 1022, 991, 1023,
+};
+
+// Neighborhood 2-tuples for various scans and blocksizes,
+// in {top, left} order for each position in scan order
+// (MAX_NEIGHBORS == 2; edge positions reuse the one available
+// neighbor for both entries, and DC gets {0, 0}).
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+static int find_in_scan(const int16_t *scan, int l, int idx) {
+ int n, l2 = l * l;
+ for (n = 0; n < l2; n++) {
+ int rc = scan[n];
+ if (rc == idx)
+ return n;
+ }
+ assert(0);
+ return -1;
+}
+static void init_scan_neighbors(const int16_t *scan,
+ int16_t *iscan,
+ int l, int16_t *neighbors) {
+ int l2 = l * l;
+ int n, i, j;
+
+ // dc doesn't use this type of prediction
+ neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+ iscan[0] = find_in_scan(scan, l, 0);
+ for (n = 1; n < l2; n++) {
+ int rc = scan[n];
+ iscan[n] = find_in_scan(scan, l, n);
+ i = rc / l;
+ j = rc % l;
+ if (i > 0 && j > 0) {
+ // col/row scan is used for adst/dct, and generally means that
+ // energy decreases to zero much faster in the dimension in
+ // which ADST is used compared to the direction in which DCT
+ // is used. Likewise, we find much higher correlation between
+ // coefficients within the direction in which DCT is used.
+ // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
+ // as a context. If ADST or DCT is used in both directions, we
+ // use the combination of the two as a context.
+ int a = (i - 1) * l + j;
+ int b = i * l + j - 1;
+ if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
+ scan == vp9_col_scan_16x16) {
+ // in the col/row scan cases (as well as left/top edge cases), we set
+ // both contexts to the same value, so we can branchlessly do a+b+1>>1
+ // which automatically becomes a if a == b
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = a;
+ } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
+ scan == vp9_row_scan_16x16) {
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
+ } else {
+ neighbors[MAX_NEIGHBORS * n + 0] = a;
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
+ }
+ } else if (i > 0) {
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
+ } else {
+ assert(j > 0);
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
+ }
+ assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
+ }
+ // one padding item so we don't have to add branches in code to handle
+ // calls to get_coef_context() for the token after the final dc token
+ neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
+}
+
+void vp9_init_neighbors() {
+ init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
+ vp9_default_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
+ vp9_row_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
+ vp9_col_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
+ vp9_default_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
+ vp9_row_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
+ vp9_col_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
+ vp9_default_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
+ vp9_row_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
+ vp9_col_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
+ vp9_default_scan_32x32_neighbors);
+}
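
Two invariants are worth calling out: the iscan tables built here are the inverse permutations of the scan tables (scan maps scan position to raster index, iscan maps back), and every context neighbor of scan position n is itself coded strictly before n, which init_scan_neighbors() asserts. A small self-check sketch, assuming vp9_init_neighbors() has run:

    #include <assert.h>
    #include "vp9/common/vp9_scan.h"

    static void check_scan_tables(void) {
      int n;
      vp9_init_neighbors();
      for (n = 0; n < 16; n++)  /* iscan inverts scan */
        assert(vp9_default_iscan_4x4[vp9_default_scan_4x4[n]] == n);
      for (n = 1; n < 16; n++)  /* neighbors are coded before position n */
        assert(vp9_default_iscan_4x4[
                   vp9_default_scan_4x4_neighbors[MAX_NEIGHBORS * n + 0]] < n);
    }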
diff --git a/libvpx/vp9/common/vp9_scan.h b/libvpx/vp9/common/vp9_scan.h
new file mode 100644
index 0000000..14a1a7e
--- /dev/null
+++ b/libvpx/vp9/common/vp9_scan.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCAN_H_
+#define VP9_COMMON_VP9_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_enums.h"
+
+#define MAX_NEIGHBORS 2
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+void vp9_init_neighbors();
+
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_scan_4x4;
+ case DCT_ADST:
+ return vp9_col_scan_4x4;
+ default:
+ return vp9_default_scan_4x4;
+ }
+}
+
+static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_4x4;
+ *nb = vp9_row_scan_4x4_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_4x4;
+ *nb = vp9_col_scan_4x4_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_4x4;
+ *nb = vp9_default_scan_4x4_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_4x4;
+ case DCT_ADST:
+ return vp9_col_iscan_4x4;
+ default:
+ return vp9_default_iscan_4x4;
+ }
+}
+
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_scan_8x8;
+ case DCT_ADST:
+ return vp9_col_scan_8x8;
+ default:
+ return vp9_default_scan_8x8;
+ }
+}
+
+static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_8x8;
+ *nb = vp9_row_scan_8x8_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_8x8;
+ *nb = vp9_col_scan_8x8_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_8x8;
+ *nb = vp9_default_scan_8x8_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_8x8;
+ case DCT_ADST:
+ return vp9_col_iscan_8x8;
+ default:
+ return vp9_default_iscan_8x8;
+ }
+}
+
+static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_scan_16x16;
+ case DCT_ADST:
+ return vp9_col_scan_16x16;
+ default:
+ return vp9_default_scan_16x16;
+ }
+}
+
+static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_16x16;
+ *nb = vp9_row_scan_16x16_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_16x16;
+ *nb = vp9_col_scan_16x16_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_16x16;
+ *nb = vp9_default_scan_16x16_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_16x16;
+ case DCT_ADST:
+ return vp9_col_iscan_16x16;
+ default:
+ return vp9_default_iscan_16x16;
+ }
+}
+
+static INLINE int get_coef_context(const int16_t *neighbors,
+ const uint8_t *token_cache, int c) {
+ return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+}
+
+#endif // VP9_COMMON_VP9_SCAN_H_
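
get_coef_context averages the cached token classes of the two neighbors, (1 + t[a] + t[b]) >> 1, which collapses to t[a] whenever both tuple entries alias the same position, exactly the branchless trick described in the col/row-scan comment in vp9_scan.c. A hedged decode-side sketch of how the header's pieces fit together (token_cache handling is simplified to a placeholder):

    #include <stdint.h>
    #include "vp9/common/vp9_scan.h"

    static void decode_coeffs_sketch(TX_TYPE tx_type, int eob,
                                     uint8_t *token_cache /* [16] */) {
      const int16_t *scan, *nb;
      int c;
      get_scan_nb_4x4(tx_type, &scan, &nb);
      for (c = 0; c < eob; c++) {
        const int ctx = get_coef_context(nb, token_cache, c);
        /* ... decode the token for raster position scan[c] using ctx ... */
        token_cache[scan[c]] = 1;  /* placeholder: store decoded token class */
        (void)ctx;
      }
    }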
diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c
index 6bfd8f8..ef30404 100644
--- a/libvpx/vp9/common/vp9_seg_common.c
+++ b/libvpx/vp9/common/vp9_seg_common.c
@@ -76,7 +76,7 @@ int vp9_get_segdata(const struct segmentation *seg, int segment_id,
}
-const vp9_tree_index vp9_segment_tree[14] = {
+const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
2, 4, 6, 8, 10, 12,
0, -1, -2, -3, -4, -5, -6, -7
};
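
The bound is unchanged, just no longer hard-coded: a binary tree with n leaves has n - 1 internal nodes, each taking two vp9_tree_index slots, and VP9's MAX_SEGMENTS is 8, so TREE_SIZE(MAX_SEGMENTS) = 2 * 8 - 2 = 14, the literal it replaces. A compile-time check of that arithmetic:

    #define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
    /* Fails to compile if the new bound ever drifts from the old literal. */
    typedef char check_tree_size[TREE_SIZE(8) == 14 ? 1 : -1];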
diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h
index f22239b..eb38c06 100644
--- a/libvpx/vp9/common/vp9_seg_common.h
+++ b/libvpx/vp9/common/vp9_seg_common.h
@@ -76,7 +76,7 @@ int vp9_get_segdata(const struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
-extern const vp9_tree_index vp9_segment_tree[14];
+extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
#endif // VP9_COMMON_VP9_SEG_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_subpelvar.h b/libvpx/vp9/common/vp9_subpelvar.h
deleted file mode 100644
index fe75481..0000000
--- a/libvpx/vp9/common/vp9_subpelvar.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SUBPELVAR_H_
-#define VP9_COMMON_VP9_SUBPELVAR_H_
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-
-static void variance(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
- int w,
- int h,
- unsigned int *sse,
- int *sum) {
- int i, j;
- int diff;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- diff = src_ptr[j] - ref_ptr[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : uint8_t *src_ptr : Pointer to source block.
- * uint32_t src_pixels_per_line : Stride of input block.
- * uint32_t pixel_step : Offset between filter input samples (see notes).
- * uint32_t output_height : Input block height.
- * uint32_t output_width : Input block width.
- * int32_t *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : int32_t *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement first-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
-
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : int32_t *src_ptr : Pointer to source block.
- * uint32_t src_pixels_per_line : Stride of input block.
- * uint32_t pixel_step : Offset between filter input samples (see notes).
- * uint32_t output_height : Input block height.
- * uint32_t output_width : Input block width.
- * int32_t *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement second-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
- src_ptr++;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#endif // VP9_COMMON_VP9_SUBPELVAR_H_
diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h
index cc909e2..254a431 100644
--- a/libvpx/vp9/common/vp9_systemdependent.h
+++ b/libvpx/vp9/common/vp9_systemdependent.h
@@ -13,6 +13,7 @@
#ifdef _MSC_VER
#include <math.h>
+#define snprintf _snprintf
#endif
#include "./vpx_config.h"
@@ -23,8 +24,8 @@ void vpx_reset_mmx_state(void);
#define vp9_clear_system_state()
#endif
-#ifdef _MSC_VER
-// round is not defined in MSVC
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// round is not defined in MSVC before VS2013.
static int round(double x) {
if (x < 0)
return (int)ceil(x - 0.5);
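
The hunk cuts off before the shim's positive branch; upstream libvpx returns (int)floor(x + 0.5) there, so under that assumption the shim rounds halfway cases away from zero, matching C99 round(). Worked values:

    /*   round( 2.5) ==  3     round(-2.5) == -3
     *   round( 2.4) ==  2     round(-0.4) ==  0  */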
diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c
index 1791c1a..e3035d0 100644
--- a/libvpx/vp9/common/vp9_tile_common.c
+++ b/libvpx/vp9/common/vp9_tile_common.c
@@ -10,6 +10,8 @@
#include "vp9/common/vp9_tile_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64
@@ -17,8 +19,8 @@ static int to_sbs(n_mis) {
return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2;
}
-static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
- int tile_idx, int log2_n_tiles, int n_mis) {
+static void get_tile_offsets(int *min_tile_off, int *max_tile_off,
+ int tile_idx, int log2_n_tiles, int n_mis) {
const int n_sbs = to_sbs(n_mis);
const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
@@ -27,17 +29,14 @@ static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
*max_tile_off = MIN(sb_off2 << 3, n_mis);
}
-void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
- vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end,
- tile_col_idx, cm->log2_tile_cols, cm->mi_cols);
-}
-
-void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
- vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end,
- tile_row_idx, cm->log2_tile_rows, cm->mi_rows);
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm,
+ int row_idx, int col_idx) {
+ get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end,
+ row_idx, cm->log2_tile_rows, cm->mi_rows);
+ get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end,
+ col_idx, cm->log2_tile_cols, cm->mi_cols);
}
-
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {
const int sb_cols = to_sbs(mi_cols);
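
A worked example of get_tile_offsets, assuming 64x64 superblocks (MI_BLOCK_SIZE_LOG2 == 3, mi counts aligned up to 8): for n_mis = 150 mi columns, n_sbs = to_sbs(150) = ALIGN(150, 8) >> 3 = 19; with log2_n_tiles = 2 (four tiles) the ranges come out as

    tile 0: sb [ 0,  4) -> mi [  0,  32)
    tile 1: sb [ 4,  9) -> mi [ 32,  72)
    tile 2: sb [ 9, 14) -> mi [ 72, 112)
    tile 3: sb [14, 19) -> mi [112, 150)   (clamped by MIN(19 << 3, 150))

so tiles differ in width by at most one superblock.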
diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h
index 6d14560..a110abb 100644
--- a/libvpx/vp9/common/vp9_tile_common.h
+++ b/libvpx/vp9/common/vp9_tile_common.h
@@ -11,11 +11,17 @@
#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
#define VP9_COMMON_VP9_TILE_COMMON_H_
-#include "vp9/common/vp9_onyxc_int.h"
+struct VP9Common;
-void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
+typedef struct TileInfo {
+ int mi_row_start, mi_row_end;
+ int mi_col_start, mi_col_end;
+} TileInfo;
-void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
+// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
+ int row_idx, int col_idx);
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols);
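
A sketch of how a caller is expected to use the new TileInfo API, assuming cm is fully initialized and the tile counts were derived from cm->log2_tile_(rows|cols):

    #include "vp9/common/vp9_tile_common.h"

    static void walk_tiles(const struct VP9Common *cm, int tile_rows,
                           int tile_cols) {
      int r, c;
      for (r = 0; r < tile_rows; r++) {
        for (c = 0; c < tile_cols; c++) {
          TileInfo tile;
          vp9_tile_init(&tile, cm, r, c);
          /* process mi rows [tile.mi_row_start, tile.mi_row_end) x
           * cols [tile.mi_col_start, tile.mi_col_end) */
        }
      }
    }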
diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c
index 2e21a5b..1805fb4 100644
--- a/libvpx/vp9/common/vp9_treecoder.c
+++ b/libvpx/vp9/common/vp9_treecoder.c
@@ -25,8 +25,9 @@ static void tree2tok(struct vp9_token *const p, vp9_tree t,
if (j <= 0) {
p[-j].value = v;
p[-j].len = l;
- } else
+ } else {
tree2tok(p, t, j, v, l);
+ }
} while (++v & 1);
}
@@ -39,9 +40,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
tree2tok(p - offset, t, 0, 0, 0);
}
-static unsigned int convert_distribution(unsigned int i,
- vp9_tree tree,
- vp9_prob probs[],
+static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
unsigned int branch_ct[][2],
const unsigned int num_events[],
unsigned int tok0_offset) {
@@ -50,26 +49,25 @@ static unsigned int convert_distribution(unsigned int i,
if (tree[i] <= 0) {
left = num_events[-tree[i] - tok0_offset];
} else {
- left = convert_distribution(tree[i], tree, probs, branch_ct,
- num_events, tok0_offset);
+ left = convert_distribution(tree[i], tree, branch_ct, num_events,
+ tok0_offset);
}
if (tree[i + 1] <= 0)
right = num_events[-tree[i + 1] - tok0_offset];
else
- right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
- num_events, tok0_offset);
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events,
+ tok0_offset);
- probs[i>>1] = get_binary_prob(left, right);
- branch_ct[i>>1][0] = left;
- branch_ct[i>>1][1] = right;
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
return left + right;
}
-void vp9_tree_probs_from_distribution(
- vp9_tree tree,
- vp9_prob probs [ /* n-1 */ ],
- unsigned int branch_ct [ /* n-1 */ ] [2],
- const unsigned int num_events[ /* n */ ],
- unsigned int tok0_offset) {
- convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);
+void vp9_tree_probs_from_distribution(vp9_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */],
+ unsigned int tok0_offset) {
+ convert_distribution(0, tree, branch_ct, num_events, tok0_offset);
}
+
+
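Note: after the change above, convert_distribution only fills branch_ct;
probabilities are derived separately (see merge_probs in vp9_treecoder.h
below). For each internal node it records the summed event counts of the left
and right subtrees. A hand-worked sketch on a hypothetical 3-leaf tree, using
TREE_SIZE from the vp9_treecoder.h diff:

  static const vp9_tree_index my_tree[TREE_SIZE(3)] = {
    0, 2,     /* node 0: left = token 0, right -> node at index 2 */
    -1, -2    /* node 2: left = token 1, right = token 2 */
  };

  unsigned int branch_ct[2][2];
  const unsigned int num_events[3] = { 10, 30, 60 };

  vp9_tree_probs_from_distribution(my_tree, branch_ct, num_events, 0);
  /* branch_ct[1] = { 30, 60 }   (node at index 2)
     branch_ct[0] = { 10, 90 }   (root; 90 = 30 + 60)   */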
diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h
index 31182c3..9c776d6 100644
--- a/libvpx/vp9/common/vp9_treecoder.h
+++ b/libvpx/vp9/common/vp9_treecoder.h
@@ -21,6 +21,8 @@ typedef uint8_t vp9_prob;
typedef int8_t vp9_tree_index;
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
#define vp9_complement(x) (255 - x)
/* We build coding trees compactly in arrays.
@@ -30,7 +32,7 @@ typedef int8_t vp9_tree_index;
Index > 0 means need another bit, specification at index.
Nonnegative indices are always even; processing begins at node 0. */
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+typedef const vp9_tree_index vp9_tree[];
struct vp9_token {
int value;
@@ -48,11 +50,11 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
probability updates. */
void vp9_tree_probs_from_distribution(vp9_tree tree,
- vp9_prob probs[ /* n - 1 */ ],
unsigned int branch_ct[ /* n - 1 */ ][2],
const unsigned int num_events[ /* n */ ],
unsigned int tok0_offset);
+
static INLINE vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
@@ -79,21 +81,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
+ const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
const unsigned int count = MIN(ct[0] + ct[1], count_sat);
const unsigned int factor = max_update_factor * count / count_sat;
return weighted_prob(pre_prob, prob, factor);
}
-static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
- const unsigned int ct[2],
- unsigned int count_sat,
- unsigned int max_update_factor) {
- return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
- max_update_factor);
+static unsigned int tree_merge_probs_impl(unsigned int i,
+ const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts,
+ unsigned int count_sat,
+ unsigned int max_update_factor,
+ vp9_prob *probs) {
+ const int l = tree[i];
+ const unsigned int left_count = (l <= 0)
+ ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const int r = tree[i + 1];
+ const unsigned int right_count = (r <= 0)
+ ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const unsigned int ct[2] = { left_count, right_count };
+ probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
+ count_sat, max_update_factor);
+ return left_count + right_count;
+}
+
+static void tree_merge_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts, int offset,
+ unsigned int count_sat,
+ unsigned int max_update_factor, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset],
+ count_sat, max_update_factor, probs);
}
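Note: merge_probs now derives the fresh probability from the counts itself,
and tree_merge_probs walks a tree applying it at every internal node. Worked
numbers for merge_probs (illustrative): with ct = {30, 10}, count_sat = 20 and
max_update_factor = 128, the count saturates at 20, so factor = 128 and the
result is the midpoint blend weighted_prob(pre_prob, get_binary_prob(30, 10),
128). A usage sketch for the tree walker; names and adaptation constants are
hypothetical:

  #define MY_COUNT_SAT          20
  #define MY_MAX_UPDATE_FACTOR 128

  static const vp9_tree_index my_tree[TREE_SIZE(3)] = { 0, 2, -1, -2 };

  /* counts[] holds one event count per token; pre_probs[]/probs[] hold
     one entry per internal node (n - 1 = 2 here). */
  static void adapt_my_probs(const vp9_prob pre_probs[2],
                             const unsigned int counts[3],
                             vp9_prob probs[2]) {
    tree_merge_probs(my_tree, pre_probs, counts, 0,
                     MY_COUNT_SAT, MY_MAX_UPDATE_FACTOR, probs);
  }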
diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c
index 3f1c198..106e6d4 100644
--- a/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -36,90 +36,28 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
{ 8, 8, 8, 8, 120, 120, 120, 120 }
};
-#if HAVE_SSSE3
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
+typedef void filter8_1dfunction (
+ const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter
+);
-void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -279,7 +217,7 @@ void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
assert(w <= 64);
assert(h <= 64);
@@ -300,7 +238,7 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
assert(w <= 64);
assert(h <= 64);
@@ -317,3 +255,214 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
}
}
#endif
+
+#if HAVE_SSE2
+filter8_1dfunction vp9_filter_block1d16_v8_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_sse2;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Ensure the filter can be compressed to int16_t. */
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
+
+ assert(w <= 64);
+ assert(h <= 64);
+ if (x_step_q4 == 16 && y_step_q4 == 16) {
+ vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h + 7);
+ vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ }
+}
+
+void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
+
+ assert(w <= 64);
+ assert(h <= 64);
+ if (x_step_q4 == 16 && y_step_q4 == 16) {
+ vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h + 7);
+ vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ } else {
+ vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ }
+}
+#endif
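Note: the SSE2 wrappers added above all share one dispatch shape: consume the
width with the widest available column kernel (16, then 8, then 4 pixels) and
leave any remainder, or the cases where the step is not unit (x_step_q4 != 16)
or the filter's center tap is 128, to the C fallback. A condensed, purely
illustrative sketch of that shape; the real wrappers unroll it per width:

  typedef struct {
    int width;
    filter8_1dfunction *fn;
  } width_dispatch;

  static int convolve_tiled(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride,
                            int w, int h, const short *filter,
                            const width_dispatch *table, int n) {
    int t;
    for (t = 0; t < n; ++t) {
      while (w >= table[t].width) {
        table[t].fn(src, src_stride, dst, dst_stride, h, filter);
        src += table[t].width;
        dst += table[t].width;
        w -= table[t].width;
      }
    }
    return w;  /* leftover width (0..3) goes to the C fallback */
  }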
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index 8f740f4..ccf5aac 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -26,10 +26,10 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
__m128i input0, input1, input2, input3;
// Rows
- input0 = _mm_loadl_epi64((__m128i *)input);
- input1 = _mm_loadl_epi64((__m128i *)(input + 4));
- input2 = _mm_loadl_epi64((__m128i *)(input + 8));
- input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+ input0 = _mm_loadl_epi64((const __m128i *)input);
+ input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
+ input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
+ input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
// Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -148,7 +148,7 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE4X4(dest, input3);
}
-void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -165,41 +165,6 @@ void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE4X4(dest, dc_value);
}
-void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
- (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
-
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i in, temp;
-
- // Load input data.
- in = _mm_loadl_epi64((__m128i *)input);
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- in = _mm_shufflelo_epi16(in, 0xd8);
- in = _mm_unpacklo_epi32(in, in);
-
- // Stage 1
- in = _mm_madd_epi16(in, c1);
- in = _mm_add_epi32(in, rounding);
- in = _mm_srai_epi32(in, DCT_CONST_BITS);
- in = _mm_packs_epi32(in, zero);
-
- // Stage 2
- temp = _mm_shufflelo_epi16(in, 0x9c);
- in = _mm_shufflelo_epi16(in, 0xc9);
- in = _mm_unpacklo_epi64(temp, in);
- in = _mm_madd_epi16(in, c2);
- in = _mm_packs_epi32(in, zero);
-
- // Store results
- _mm_storel_epi64((__m128i *)output, in);
-}
-
static INLINE void transpose_4x4(__m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
@@ -210,7 +175,7 @@ static INLINE void transpose_4x4(__m128i *res) {
res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
-void idct4_1d_sse2(__m128i *in) {
+static void idct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -249,7 +214,7 @@ void idct4_1d_sse2(__m128i *in) {
in[3] = _mm_sub_epi16(u[0], u[3]);
}
-void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_1d_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -299,16 +264,16 @@ void iadst4_1d_sse2(__m128i *in) {
in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}
-void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[4];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0] = _mm_loadl_epi64((__m128i *)input);
- in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
- in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
- in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+ in[0] = _mm_loadl_epi64((const __m128i *)input);
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
switch (tx_type) {
case 0: // DCT_DCT
@@ -450,7 +415,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
-#define IDCT8x8_1D \
+#define IDCT8_1D \
/* Stage1 */ \
{ \
const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
@@ -529,7 +494,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
dest += stride; \
}
-void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -549,23 +514,23 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
int i;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
// 2-D
for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
}
// Final rounding and shift
@@ -597,7 +562,7 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
-void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -648,7 +613,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
-void idct8_1d_sse2(__m128i *in) {
+static void idct8_1d_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -673,12 +638,12 @@ void idct8_1d_sse2(__m128i *in) {
in6 = in[6];
in7 = in[7];
- // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
in[0] = in0;
in[1] = in1;
in[2] = in2;
@@ -689,7 +654,7 @@ void idct8_1d_sse2(__m128i *in) {
in[7] = in7;
}
-void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_1d_sse2(__m128i *in) {
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -918,21 +883,21 @@ void iadst8_1d_sse2(__m128i *in) {
}
-void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1<<4);
// load input data
- in[0] = _mm_load_si128((__m128i *)input);
- in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
- in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
- in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
- in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
- in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
- in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
- in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
switch (tx_type) {
case 0: // DCT_DCT
@@ -985,7 +950,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest, in[7]);
}
-void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -1005,16 +970,16 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Rows. Load 4-row input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
// 8x4 Transpose
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
// Stage1
- {
+ { //NOLINT
const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
@@ -1039,7 +1004,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
// Stage2
- {
+ { //NOLINT
const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
@@ -1069,7 +1034,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
// Stage3
- {
+ { //NOLINT
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
@@ -1103,7 +1068,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in4, in5, in6, in7)
// 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1134,7 +1099,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
-#define IDCT16x16_1D \
+#define IDCT16_1D \
/* Stage2 */ \
{ \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
@@ -1263,7 +1228,8 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -1318,22 +1284,22 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
if (i == 1) input += 128;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
- in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
- in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
- in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
- in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
@@ -1355,7 +1321,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in12, in13, in14, in15);
}
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
if (i == 0) {
@@ -1470,7 +1436,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a, i;
@@ -1519,7 +1485,7 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
res0[15] = tbuf[7];
}
-void iadst16_1d_8col(__m128i *in) {
+static void iadst16_1d_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1989,7 +1955,7 @@ void iadst16_1d_8col(__m128i *in) {
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-void idct16_1d_8col(__m128i *in) {
+static void idct16_1d_8col(__m128i *in) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2333,36 +2299,36 @@ void idct16_1d_8col(__m128i *in) {
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
idct16_1d_8col(in0);
idct16_1d_8col(in1);
}
-void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
iadst16_1d_8col(in0);
iadst16_1d_8col(in1);
}
-static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * 16));
-
- in[8] = _mm_load_si128((__m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((__m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((__m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((__m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((__m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((__m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((__m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((__m128i *)(input + 15 * 16));
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -2421,8 +2387,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest, in[15]);
}
-void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in0[16], in1[16];
load_buffer_8x16(input, in0);
@@ -2456,8 +2422,8 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
write_buffer_8x16(dest, in1, stride);
}
-void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
- int stride) {
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -2503,14 +2469,14 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// 1-D idct. Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
@@ -2737,7 +2703,7 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
in0 = _mm_add_epi16(stp2_0, stp1_15);
@@ -2815,11 +2781,704 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
#define LOAD_DQCOEFF(reg, input) \
{ \
- reg = _mm_load_si128((__m128i *) input); \
+ reg = _mm_load_si128((const __m128i *) input); \
input += 8; \
} \
-void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+#define IDCT32_1D \
+/* Stage1 */ \
+{ \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+ stp1_17, stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+ stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+ stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+ in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i col[128];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
+  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
+ for (i = 0; i < 8; i++) {
+ i32 = (i << 5);
+ if (i == 0) {
+ // First 1-D idct: first 8 rows
+ // Load input data.
+ LOAD_DQCOEFF(in0, input);
+ LOAD_DQCOEFF(in8, input);
+ LOAD_DQCOEFF(in16, input);
+ LOAD_DQCOEFF(in24, input);
+ LOAD_DQCOEFF(in1, input);
+ LOAD_DQCOEFF(in9, input);
+ LOAD_DQCOEFF(in17, input);
+ LOAD_DQCOEFF(in25, input);
+ LOAD_DQCOEFF(in2, input);
+ LOAD_DQCOEFF(in10, input);
+ LOAD_DQCOEFF(in18, input);
+ LOAD_DQCOEFF(in26, input);
+ LOAD_DQCOEFF(in3, input);
+ LOAD_DQCOEFF(in11, input);
+ LOAD_DQCOEFF(in19, input);
+ LOAD_DQCOEFF(in27, input);
+
+ LOAD_DQCOEFF(in4, input);
+ LOAD_DQCOEFF(in12, input);
+ LOAD_DQCOEFF(in20, input);
+ LOAD_DQCOEFF(in28, input);
+ LOAD_DQCOEFF(in5, input);
+ LOAD_DQCOEFF(in13, input);
+ LOAD_DQCOEFF(in21, input);
+ LOAD_DQCOEFF(in29, input);
+ LOAD_DQCOEFF(in6, input);
+ LOAD_DQCOEFF(in14, input);
+ LOAD_DQCOEFF(in22, input);
+ LOAD_DQCOEFF(in30, input);
+ LOAD_DQCOEFF(in7, input);
+ LOAD_DQCOEFF(in15, input);
+ LOAD_DQCOEFF(in23, input);
+ LOAD_DQCOEFF(in31, input);
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+ in18, in19, in20, in21, in22, in23);
+ TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+ in26, in27, in28, in29, in30, in31);
+ } else if (i < 4) {
+ // First 1-D idct: next 24 zero-coeff rows
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ } else {
+ // Second 1-D idct
+ j = i - 4;
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+ in11, in12, in13, in14, in15);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+ in19, in20, in21, in22, in23);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+ in28, in29, in30, in31);
+ }
+
+ IDCT32_1D
+
+ // final stage
+ if (i < 4) {
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+
+ // 2-D: Calculate the results and store them to the destination.
+ in0 = _mm_add_epi16(stp1_0, stp1_31);
+ in1 = _mm_add_epi16(stp1_1, stp1_30);
+ in2 = _mm_add_epi16(stp1_2, stp1_29);
+ in3 = _mm_add_epi16(stp1_3, stp1_28);
+ in4 = _mm_add_epi16(stp1_4, stp1_27);
+ in5 = _mm_add_epi16(stp1_5, stp1_26);
+ in6 = _mm_add_epi16(stp1_6, stp1_25);
+ in7 = _mm_add_epi16(stp1_7, stp1_24);
+ in8 = _mm_add_epi16(stp1_8, stp1_23);
+ in9 = _mm_add_epi16(stp1_9, stp1_22);
+ in10 = _mm_add_epi16(stp1_10, stp1_21);
+ in11 = _mm_add_epi16(stp1_11, stp1_20);
+ in12 = _mm_add_epi16(stp1_12, stp1_19);
+ in13 = _mm_add_epi16(stp1_13, stp1_18);
+ in14 = _mm_add_epi16(stp1_14, stp1_17);
+ in15 = _mm_add_epi16(stp1_15, stp1_16);
+ in16 = _mm_sub_epi16(stp1_15, stp1_16);
+ in17 = _mm_sub_epi16(stp1_14, stp1_17);
+ in18 = _mm_sub_epi16(stp1_13, stp1_18);
+ in19 = _mm_sub_epi16(stp1_12, stp1_19);
+ in20 = _mm_sub_epi16(stp1_11, stp1_20);
+ in21 = _mm_sub_epi16(stp1_10, stp1_21);
+ in22 = _mm_sub_epi16(stp1_9, stp1_22);
+ in23 = _mm_sub_epi16(stp1_8, stp1_23);
+ in24 = _mm_sub_epi16(stp1_7, stp1_24);
+ in25 = _mm_sub_epi16(stp1_6, stp1_25);
+ in26 = _mm_sub_epi16(stp1_5, stp1_26);
+ in27 = _mm_sub_epi16(stp1_4, stp1_27);
+ in28 = _mm_sub_epi16(stp1_3, stp1_28);
+ in29 = _mm_sub_epi16(stp1_2, stp1_29);
+ in30 = _mm_sub_epi16(stp1_1, stp1_30);
+ in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+ in16 = _mm_adds_epi16(in16, final_rounding);
+ in17 = _mm_adds_epi16(in17, final_rounding);
+ in18 = _mm_adds_epi16(in18, final_rounding);
+ in19 = _mm_adds_epi16(in19, final_rounding);
+ in20 = _mm_adds_epi16(in20, final_rounding);
+ in21 = _mm_adds_epi16(in21, final_rounding);
+ in22 = _mm_adds_epi16(in22, final_rounding);
+ in23 = _mm_adds_epi16(in23, final_rounding);
+ in24 = _mm_adds_epi16(in24, final_rounding);
+ in25 = _mm_adds_epi16(in25, final_rounding);
+ in26 = _mm_adds_epi16(in26, final_rounding);
+ in27 = _mm_adds_epi16(in27, final_rounding);
+ in28 = _mm_adds_epi16(in28, final_rounding);
+ in29 = _mm_adds_epi16(in29, final_rounding);
+ in30 = _mm_adds_epi16(in30, final_rounding);
+ in31 = _mm_adds_epi16(in31, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+ in16 = _mm_srai_epi16(in16, 6);
+ in17 = _mm_srai_epi16(in17, 6);
+ in18 = _mm_srai_epi16(in18, 6);
+ in19 = _mm_srai_epi16(in19, 6);
+ in20 = _mm_srai_epi16(in20, 6);
+ in21 = _mm_srai_epi16(in21, 6);
+ in22 = _mm_srai_epi16(in22, 6);
+ in23 = _mm_srai_epi16(in23, 6);
+ in24 = _mm_srai_epi16(in24, 6);
+ in25 = _mm_srai_epi16(in25, 6);
+ in26 = _mm_srai_epi16(in26, 6);
+ in27 = _mm_srai_epi16(in27, 6);
+ in28 = _mm_srai_epi16(in28, 6);
+ in29 = _mm_srai_epi16(in29, 6);
+ in30 = _mm_srai_epi16(in30, 6);
+ in31 = _mm_srai_epi16(in31, 6);
+
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+ RECON_AND_STORE(dest, in16);
+ RECON_AND_STORE(dest, in17);
+ RECON_AND_STORE(dest, in18);
+ RECON_AND_STORE(dest, in19);
+ RECON_AND_STORE(dest, in20);
+ RECON_AND_STORE(dest, in21);
+ RECON_AND_STORE(dest, in22);
+ RECON_AND_STORE(dest, in23);
+ RECON_AND_STORE(dest, in24);
+ RECON_AND_STORE(dest, in25);
+ RECON_AND_STORE(dest, in26);
+ RECON_AND_STORE(dest, in27);
+ RECON_AND_STORE(dest, in28);
+ RECON_AND_STORE(dest, in29);
+ RECON_AND_STORE(dest, in30);
+ RECON_AND_STORE(dest, in31);
+
+ dest += 8 - (stride * 32);
+ }
+ }
+}
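
The final-stage arithmetic above is plain round-to-nearest by 64: after the second 1-D pass the results carry a scale factor of 64, so each 16-bit lane gets 1 << 5 added (with saturation) and is then arithmetic-shifted right by 6 before RECON_AND_STORE adds it to the prediction. A scalar sketch of the per-lane step (the vector code applies it to eight lanes per register via _mm_adds_epi16/_mm_srai_epi16):

    /* Scalar equivalent of the final rounding/shift: round(x / 64).
     * (The SIMD version uses a saturating add; plain int is fine here.) */
    static int round_shift_6(int x) {
      return (x + (1 << 5)) >> 6;
    }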
+
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3042,336 +3701,7 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in28, in29, in30, in31);
}
- // Stage1
- {
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
-
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
- const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
-
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
-
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
-
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
- stp1_17, stp1_30)
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
- stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
- stp1_23, stp1_24)
- }
-
- // Stage2
- {
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
-
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
-
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
- stp2_14)
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
- stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
-
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
-
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
- }
-
- // Stage3
- {
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
-
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
-
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
- stp1_6)
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
-
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
- stp1_18, stp1_29)
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
- stp1_22, stp1_25)
-
- stp1_16 = stp2_16;
- stp1_31 = stp2_31;
- stp1_19 = stp2_19;
- stp1_20 = stp2_20;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_27 = stp2_27;
- stp1_28 = stp2_28;
- }
-
- // Stage4
- {
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
-
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
- stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
- stp2_2, stp2_3)
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
- stp2_10, stp2_13)
-
- stp2_8 = stp1_8;
- stp2_15 = stp1_15;
- stp2_11 = stp1_11;
- stp2_12 = stp1_12;
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
- }
-
- // Stage5
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp1);
- stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
- stp1_4 = stp2_4;
- stp1_7 = stp2_7;
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
-
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
- stp1_21, stp1_26)
-
- stp1_22 = stp2_22;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_25 = stp2_25;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
-
- // Stage6
- {
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
-
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
-
- stp2_8 = stp1_8;
- stp2_9 = stp1_9;
- stp2_14 = stp1_14;
- stp2_15 = stp1_15;
-
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
- stp2_13, stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
-
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
- }
-
- // Stage7
- {
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
- stp1_18 = stp2_18;
- stp1_19 = stp2_19;
-
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
- stp1_23, stp1_24)
-
- stp1_28 = stp2_28;
- stp1_29 = stp2_29;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
+ IDCT32_1D
// final stage
if (i < 4) {
@@ -3548,4 +3878,52 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
dest += 8 - (stride * 32);
}
}
+}  // NOLINT
+
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 4; ++i) {
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ dest += 8 - (stride * 32);
+ }
}
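
The DC-only variant above skips the transform entirely: a DC-only 32x32 block inverse-transforms to a constant, so a single value is derived from input[0] (one multiplication by cospi_16_64 per 1-D pass, then the same round-by-64) and added to all 1024 destination pixels. A scalar sketch of the same flow, assuming dct_const_round_shift(), ROUND_POWER_OF_TWO() and clip_pixel() behave as declared in vp9_idct.h and vp9_common.h:

    /* Scalar sketch of the DC-only path (helper semantics assumed above). */
    static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                       int stride) {
      int r, c;
      int a = dct_const_round_shift(input[0] * cospi_16_64);  /* 1st 1-D pass */
      a = dct_const_round_shift(a * cospi_16_64);             /* 2nd 1-D pass */
      a = ROUND_POWER_OF_TWO(a, 6);                           /* round by 64 */
      for (r = 0; r < 32; ++r, dest += stride)
        for (c = 0; c < 32; ++c)
          dest[c] = clip_pixel(dest[c] + a);
    }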
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
index 980b8b9..69b07f6 100644
--- a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
+++ b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -19,12 +19,14 @@ pw_32: times 8 dw 32
SECTION .text
INIT_MMX sse
-cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left
+cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
movd m0, [aboveq]
punpckldq m0, [leftq]
psadbw m0, m1
- paddw m0, [pw_4]
+ paddw m0, [GLOBAL(pw_4)]
psraw m0, 3
pshufw m0, m0, 0x0
packuswb m0, m0
@@ -33,10 +35,14 @@ cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left
lea dstq, [dstq+strideq*2]
movd [dstq ], m0
movd [dstq+strideq], m0
+
+ RESTORE_GOT
RET
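
A note on the pattern repeated throughout this file: GET_GOT materializes the address of the global offset table in the new goffset register (hence each cglobal declaration gains one register and a goffset argument), GLOBAL() turns the absolute references to constants such as pw_4 into GOT-relative ones, and RESTORE_GOT undoes the setup before returning. On non-PIC builds these macros should expand to nothing, so the change only affects position-independent (shared-library) builds.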
INIT_MMX sse
-cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
movq m0, [aboveq]
movq m2, [leftq]
@@ -45,7 +51,7 @@ cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
psadbw m0, m1
psadbw m2, m1
paddw m0, m2
- paddw m0, [pw_8]
+ paddw m0, [GLOBAL(pw_8)]
psraw m0, 4
pshufw m0, m0, 0x0
packuswb m0, m0
@@ -58,10 +64,14 @@ cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
movq [dstq+strideq ], m0
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
RET
INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
mova m0, [aboveq]
mova m2, [leftq]
@@ -73,7 +83,7 @@ cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
paddw m0, m2
movhlps m2, m0
paddw m0, m2
- paddw m0, [pw_16]
+ paddw m0, [GLOBAL(pw_16)]
psraw m0, 5
pshuflw m0, m0, 0x0
punpcklqdq m0, m0
@@ -86,10 +96,14 @@ cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
lea dstq, [dstq+strideq*4]
dec lines4d
jnz .loop
+
+ RESTORE_GOT
REP_RET
INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
mova m0, [aboveq]
mova m2, [aboveq+16]
@@ -107,7 +121,7 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
paddw m0, m4
movhlps m2, m0
paddw m0, m2
- paddw m0, [pw_32]
+ paddw m0, [GLOBAL(pw_32)]
psraw m0, 6
pshuflw m0, m0, 0x0
punpcklqdq m0, m0
@@ -124,6 +138,8 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
lea dstq, [dstq+strideq*4]
dec lines4d
jnz .loop
+
+ RESTORE_GOT
REP_RET
INIT_MMX sse
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
index 8ba26f3..88df9b2 100644
--- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -13,27 +13,23 @@
SECTION_RODATA
pb_1: times 16 db 1
-pw_2: times 8 dw 2
-pb_7m1: times 8 db 7, -1
-pb_15: times 16 db 15
-
-sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7
+sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
-sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
-sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
-sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
-sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
-sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
-sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
-sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
-sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
SECTION .text
@@ -112,14 +108,16 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
REP_RET
INIT_MMX ssse3
-cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
movq m0, [aboveq]
- pshufb m2, m0, [sh_b23456777]
- pshufb m1, m0, [sh_b01234577]
- pshufb m0, [sh_b12345677]
+ pshufb m2, m0, [GLOBAL(sh_b23456777)]
+ pshufb m1, m0, [GLOBAL(sh_b01234577)]
+ pshufb m0, [GLOBAL(sh_b12345677)]
pavgb m3, m2, m1
pxor m2, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m0, m3
@@ -132,19 +130,23 @@ cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
movd [dstq ], m0
psrlq m0, 8
movd [dstq+strideq], m0
+
+ RESTORE_GOT
RET
INIT_MMX ssse3
-cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
movq m0, [aboveq]
- mova m1, [sh_b12345677]
- DEFINE_ARGS dst, stride, stride3, line
+ mova m1, [GLOBAL(sh_b12345677)]
+ DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
- pshufb m2, m0, [sh_b23456777]
+ pshufb m2, m0, [GLOBAL(sh_b23456777)]
pavgb m3, m2, m0
pxor m2, m0
pshufb m0, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m0, m3
@@ -167,20 +169,24 @@ cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
movq [dstq+strideq*2], m0
pshufb m0, m1
movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
RET
INIT_XMM ssse3
-cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
+cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
+ GET_GOT goffsetq
+
mova m0, [aboveq]
DEFINE_ARGS dst, stride, stride3, dst8, line
lea stride3q, [strideq*3]
lea dst8q, [dstq+strideq*8]
- mova m1, [sh_b123456789abcdeff]
- pshufb m2, m0, [sh_b23456789abcdefff]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
pavgb m3, m2, m0
pxor m2, m0
pshufb m0, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m0, m3
@@ -214,29 +220,33 @@ cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
movhps [dstq+strideq +8], m0
movhps [dstq+strideq*2+8], m0
movhps [dstq+stride3q +8], m0
+
+ RESTORE_GOT
RET
INIT_XMM ssse3
-cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
+cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
+ GET_GOT goffsetq
+
mova m0, [aboveq]
mova m4, [aboveq+16]
DEFINE_ARGS dst, stride, stride3, dst16, line
lea stride3q, [strideq*3]
lea dst16q, [dstq +strideq*8]
lea dst16q, [dst16q+strideq*8]
- mova m1, [sh_b123456789abcdeff]
- pshufb m2, m4, [sh_b23456789abcdefff]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
pavgb m3, m2, m4
pxor m2, m4
palignr m5, m4, m0, 1
palignr m6, m4, m0, 2
pshufb m4, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m4, m3
pavgb m3, m0, m6
pxor m0, m6
- pand m0, [pb_1]
+ pand m0, [GLOBAL(pb_1)]
psubb m3, m0
pavgb m5, m3
@@ -288,4 +298,739 @@ cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
mova [dstq +strideq +16], m4
mova [dstq +strideq*2+16], m4
mova [dstq +stride3q +16], m4
+
+ RESTORE_GOT
+ RET
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from Pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
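
The identity the macro relies on can be checked exhaustively in scalar code. A minimal sketch, with avg() modeling pavgb's (a + b + 1) >> 1:

    /* Checks: (x + 2*y + z + 2) >> 2 == avg(avg(x, z) - ((x ^ z) & 1), y)
     * for all byte triples; subtracting the xor's low bit turns the
     * rounded average of x and z into the truncated one. */
    #include <assert.h>

    static int avg(int a, int b) { return (a + b + 1) >> 1; }

    static void check_avg_trick(void) {
      int x, y, z;
      for (x = 0; x < 256; ++x)
        for (y = 0; y < 256; ++y)
          for (z = 0; z < 256; ++z)
            assert(avg(avg(x, z) - ((x ^ z) & 1), y) ==
                   ((x + 2 * y + z + 2) >> 2));
    }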
+
+INIT_XMM ssse3
+cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ pshufb m1, m3, [GLOBAL(sh_b23456777)]
+ pshufb m2, m3, [GLOBAL(sh_b12345677)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m3, 1
+ psrldq m4, 1
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+ pshufb m3, [GLOBAL(sh_b0123456777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ psrldq m3, 1
+ psrldq m4, 1
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, line
+ lea stride3q, [strideq*3]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m0, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
+ pavgb m0, m3
+
+ mov lined, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m7, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, line
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ lea stride3q, [strideq*3]
+ pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m7, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
+ palignr m6, m7, m0, 1
+ palignr m5, m7, m0, 2
+ pavgb m7, m3
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
+ pavgb m0, m6
+
+ mov lined, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m7
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq +16], m4
+ palignr m3, m7, m0, 1
+ palignr m5, m4, m2, 1
+ pshufb m7, m1
+ pshufb m4, m1
+
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m5
+ mova [dstq+stride3q +16], m4
+ palignr m0, m7, m3, 1
+ palignr m2, m4, m5, 1
+ pshufb m7, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; l1, l2, l3, l4
+ movd m1, [aboveq-1] ; tl, t1, t2, t3
+ punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+ pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+ psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+ psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1
+ ; A2 B2 A1 B1
+ ; A3 B3 A2 B2
+ ; A4 B4 A3 B3
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+ pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+ punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+stride3q ], m3
+ psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq*2], m3
+ psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq ], m3
+ psrldq m3, 2 ; A1 B1 C1 D1 ..
+ movd [dstq ], m3
+ RESTORE_GOT
+ RET
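
Mapping the A/B labels in the comments back to scalars: column A is a 2-tap average running down the left edge, column B the corresponding 3-tap average, the rest of the top row is built from 3-tap averages of the above row, and every later row is the row above it shifted right by two. A scalar sketch of the 4x4 case, assuming AVG2(a, b) = (a + b + 1) >> 1, AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2, and above[-1] being the top-left pixel:

    static void d153_4x4_sketch(uint8_t *dst, int stride,
                                const uint8_t *above, const uint8_t *left) {
      int r, c;
      dst[0] = AVG2(above[-1], left[0]);                   /* column A */
      for (r = 1; r < 4; ++r)
        dst[r * stride] = AVG2(left[r - 1], left[r]);
      dst[1] = AVG3(left[0], above[-1], above[0]);         /* column B */
      dst[stride + 1] = AVG3(above[-1], left[0], left[1]);
      for (r = 2; r < 4; ++r)
        dst[r * stride + 1] = AVG3(left[r - 2], left[r - 1], left[r]);
      for (c = 2; c < 4; ++c)                              /* C1, D1 */
        dst[c] = AVG3(above[c - 3], above[c - 2], above[c - 1]);
      for (r = 1; r < 4; ++r)                              /* shifted copies */
        for (c = 2; c < 4; ++c)
          dst[r * stride + c] = dst[(r - 1) * stride + c - 2];
    }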
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movq m0, [leftq] ; [0- 7] l1-8 [byte]
+ movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+ pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
+ pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
+ pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
+ pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
+ psrldq m4, m0, 1 ; t1-7 [word]
+ psrldq m5, m0, 2 ; t2-7 [word]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1
+ ; A2 B2 A1 B1 C1 D1 E1 F1
+ ; A3 B3 A2 B2 A1 B1 C1 D1
+ ; A4 B4 A3 B3 A2 B2 A1 B1
+ ; A5 B5 A4 B4 A3 B3 A2 B2
+ ; A6 B6 A5 B5 A4 B4 A3 B3
+ ; A7 B7 A6 B6 A5 B5 A4 B4
+ ; A8 B8 A7 B7 A6 B6 A5 B5
+ pavgb m6, m1, m2 ; 2-tap avg A8-A1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+ punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
+ palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2 ; A-B2, A-B1, C-H1
+ movq [dstq+strideq ], m0
+ psrldq m0, 2 ; A-H1
+ movq [dstq ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
+ psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
+ movq [dstq+strideq*2], m6
+ psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
+ movq [dstq+strideq ], m6
+ psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
+ movq [dstq ], m6
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+ ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+ ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+ ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+ ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+ ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+ ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+ ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+ ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+ ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+ ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+ ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+ ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+ ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+ ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+ ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+ pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m6, 15
+ palignr m3, m0, m6, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+ pavgb m5, m0 ; A1 - Ag
+
+ punpcklbw m0, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+
+ pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+ pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ palignr m2, m1, m6, 14
+ mova [dstq ], m2
+ palignr m2, m1, m6, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m1, m6, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m1, m6, 6
+ mova [dstq ], m2
+ palignr m2, m1, m6, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 2
+ mova [dstq+strideq*2], m2
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+ mova [dstq+stride3q ], m6
+ lea dstq, [dstq+strideq*4]
+
+ palignr m2, m6, m4, 14
+ mova [dstq ], m2
+ palignr m2, m6, m4, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m6, m4, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m6, m4, 6
+ mova [dstq ], m2
+ palignr m2, m6, m4, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 2
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ movu m1, [aboveq+15]
+
+ pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+ palignr m3, m1, m7, 1
+ palignr m5, m1, m7, 2
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+ pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m7, 15
+ palignr m3, m0, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pavgb m5, m0 ; A1 - Ag
+ punpcklbw m6, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+ pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+
+ DEFINE_ARGS dst, stride, stride3, left, line
+ lea stride3q, [strideq*3]
+
+ palignr m5, m2, m1, 14
+ palignr m7, m1, m6, 14
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 12
+ palignr m7, m1, m6, 12
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 10
+ palignr m7, m1, m6, 10
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m2, m1, 8
+ palignr m7, m1, m6, 8
+ mova [dstq+stride3q ], m7
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m2, m1, 6
+ palignr m7, m1, m6, 6
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 4
+ palignr m7, m1, m6, 4
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 2
+ palignr m7, m1, m6, 2
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m6
+ mova [dstq+stride3q+16 ], m1
+ lea dstq, [dstq+strideq*4]
+
+ palignr m5, m1, m6, 14
+ palignr m3, m6, m4, 14
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 12
+ palignr m3, m6, m4, 12
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 10
+ palignr m3, m6, m4, 10
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m1, m6, 8
+ palignr m3, m6, m4, 8
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m1, m6, 6
+ palignr m3, m6, m4, 6
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 4
+ palignr m3, m6, m4, 4
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 2
+ palignr m3, m6, m4, 2
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m4
+ mova [dstq+stride3q+16 ], m6
+ lea dstq, [dstq+strideq*4]
+
+ mova m7, [leftq]
+ mova m3, [leftq+16]
+ palignr m5, m3, m7, 15
+ palignr m0, m3, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+ pavgb m5, m3 ; Ah -
+ punpcklbw m3, m2, m5 ; A-B8 ... A-B1
+ punpckhbw m2, m5 ; A-B9 ... A-Bg
+ pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
+
+ palignr m7, m6, m4, 14
+ palignr m0, m4, m3, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 12
+ palignr m0, m4, m3, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 10
+ palignr m0, m4, m3, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m6, m4, 8
+ palignr m0, m4, m3, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m6, m4, 6
+ palignr m0, m4, m3, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 4
+ palignr m0, m4, m3, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 2
+ palignr m0, m4, m3, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m4
+ lea dstq, [dstq+strideq*4]
+
+ palignr m7, m4, m3, 14
+ palignr m0, m3, m2, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 12
+ palignr m0, m3, m2, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 10
+ palignr m0, m3, m2, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m4, m3, 8
+ palignr m0, m3, m2, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m4, m3, 6
+ palignr m0, m3, m2, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 4
+ palignr m0, m3, m2, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 2
+ palignr m0, m3, m2, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX ssse3
+cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; abcd [byte]
+ pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
+ pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
+ pavgb m1, m0 ; ab, bc, cd, d [byte]
+
+ punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+ movd [dstq ], m1
+ psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
+ movd [dstq+strideq], m1
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 16 ; cd, c3d, d, d
+ movd [dstq ], m1
+ pshufw m1, m1, q1111 ; d, d, d, d
+ movd [dstq+strideq], m1
+ RESTORE_GOT
+ RET
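
In scalar terms the byte comments above say: even output columns are 2-tap averages of consecutive left pixels, odd columns are the matching 3-tap averages, each row starts one pixel pair further down, and everything past the last left pixel repeats it. A sketch of the 4x4 case, with AVG2/AVG3 as in the d153 sketch earlier:

    static void d207_4x4_sketch(uint8_t *dst, int stride, const uint8_t *left) {
      /* One interleaved row: ab, a2bc, bc, b2cd, cd, c3d, d, d. */
      uint8_t row[8];
      int r, c;
      row[0] = AVG2(left[0], left[1]);
      row[1] = AVG3(left[0], left[1], left[2]);
      row[2] = AVG2(left[1], left[2]);
      row[3] = AVG3(left[1], left[2], left[3]);
      row[4] = AVG2(left[2], left[3]);
      row[5] = AVG3(left[2], left[3], left[3]);            /* "c3d" */
      row[6] = left[3];
      row[7] = left[3];
      for (r = 0; r < 4; ++r)                              /* slide 2 per row */
        for (c = 0; c < 4; ++c)
          dst[r * stride + c] = row[2 * r + c < 8 ? 2 * r + c : 7];
    }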
+
+INIT_XMM ssse3
+cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ movq m3, [leftq] ; abcdefgh [byte]
+ lea stride3q, [strideq*3]
+
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
+ pavgb m0, m2
+ punpcklbw m0, m3 ; interleaved output
+
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+ psrldq m0, 2
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m0, [leftq] ; abcdefghijklmnop [byte]
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+ pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
+
+ punpckhbw m4, m1, m3 ; interleaved input
+ punpcklbw m1, m3 ; interleaved output
+ mova [dstq ], m1
+ palignr m3, m4, m1, 2
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 4
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 6
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ palignr m3, m4, m1, 8
+ mova [dstq ], m3
+ palignr m3, m4, m1, 10
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 12
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 14
+ mova [dstq+stride3q ], m3
+ DEFINE_ARGS dst, stride, stride3, line
+ mov lined, 2
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+.loop:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq*2], m4
+ pshufb m4, m0
+ mova [dstq+stride3q ], m4
+ pshufb m4, m0
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m1, [leftq] ; 0-15 [byte]
+ mova m2, [leftq+16] ; 16-31 [byte]
+ pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
+ palignr m6, m2, m1, 1
+ palignr m5, m2, m1, 2
+ pavgb m2, m4 ; high 16px even lines
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
+ pavgb m1, m6 ; low 16px even lines
+
+ punpckhbw m6, m1, m0 ; interleaved output 2
+ punpcklbw m1, m0 ; interleaved output 1
+
+ punpckhbw m7, m2, m3 ; interleaved output 4
+ punpcklbw m2, m3 ; interleaved output 3
+
+ ; output 1st 8 lines (and half of 2nd 8 lines)
+ DEFINE_ARGS dst, stride, stride3, dst8
+ lea dst8q, [dstq+strideq*8]
+ mova [dstq ], m1
+ mova [dstq +16], m6
+ mova [dst8q ], m6
+ palignr m0, m6, m1, 2
+ palignr m4, m2, m6, 2
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 4
+ palignr m4, m2, m6, 4
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 6
+ palignr m4, m2, m6, 6
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m0, m6, m1, 8
+ palignr m4, m2, m6, 8
+ mova [dstq ], m0
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m0, m6, m1, 10
+ palignr m4, m2, m6, 10
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 12
+ palignr m4, m2, m6, 12
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 14
+ palignr m4, m2, m6, 14
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+ mova [dstq +16], m2
+ mova [dst8q ], m2
+ palignr m4, m7, m2, 2
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 4
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 6
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m4, m7, m2, 8
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m4, m7, m2, 10
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 12
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 14
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+
+ ; output last half of 4th 8 lines
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+ lea dstq, [dstq+strideq*4]
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+
+ ; done!
+ RESTORE_GOT
RET
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 0000000..3c5cb8f
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
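
The commented-out scalar expressions above are the specification for this block: a column of pixels is filtered only when every neighboring difference stays within limit and the weighted p0/q0, p1/q1 difference stays within blimit. A scalar sketch of the equivalent mask, mirroring the form of the C reference (returns all-ones when filtering is allowed):

    #include <stdlib.h>  /* abs() */

    static signed char filter_mask_sketch(int limit, int blimit,
                                          int p3, int p2, int p1, int p0,
                                          int q0, int q1, int q2, int q3) {
      signed char mask = 0;
      mask |= (abs(p3 - p2) > limit) * -1;
      mask |= (abs(p2 - p1) > limit) * -1;
      mask |= (abs(p1 - p0) > limit) * -1;
      mask |= (abs(q1 - q0) > limit) * -1;
      mask |= (abs(q2 - q1) > limit) * -1;
      mask |= (abs(q3 - q2) > limit) * -1;
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      return ~mask;  /* -1: filter this column, 0: leave it */
    }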
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(
+ _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *) (s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *) (s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *) (s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+ _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+ _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight,
+ _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+ 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
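+ // flat and flat2 were computed on the low 64 bits only, so
+ // _mm_shuffle_epi32(x, 68) (68 == 0b01000100) duplicates the low half
+ // into the high half and each mask covers both the q and p halves of
+ // a qXpX register.  Every output below is then a bitwise select:
+ // (filtered & mask) | (original & ~mask).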
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
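+ // _mm_broadcastb_epi8 (AVX2 VPBROADCASTB) splats each scalar
+ // threshold byte across all 16 lanes.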
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+ flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+ flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
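+ // Here the 16 pixels are widened to 16-bit in 256-bit registers via
+ // _mm256_cvtepu8_epi16 so each running sum covers all lanes at once.
+ // After _mm256_packus_epi16(res, res), _mm256_permute4x64_epi64 with
+ // control 168 (0b10101000) gathers the two 64-bit partial results
+ // from both 128-bit lanes into the low 128 bits before the cast back
+ // to __m128i.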
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+ res_q;
+
+ p256_7 = _mm256_cvtepu8_epi16(p7);
+ p256_6 = _mm256_cvtepu8_epi16(p6);
+ p256_5 = _mm256_cvtepu8_epi16(p5);
+ p256_4 = _mm256_cvtepu8_epi16(p4);
+ p256_3 = _mm256_cvtepu8_epi16(p3);
+ p256_2 = _mm256_cvtepu8_epi16(p2);
+ p256_1 = _mm256_cvtepu8_epi16(p1);
+ p256_0 = _mm256_cvtepu8_epi16(p0);
+ q256_0 = _mm256_cvtepu8_epi16(q0);
+ q256_1 = _mm256_cvtepu8_epi16(q1);
+ q256_2 = _mm256_cvtepu8_epi16(q2);
+ q256_3 = _mm256_cvtepu8_epi16(q3);
+ q256_4 = _mm256_cvtepu8_epi16(q4);
+ q256_5 = _mm256_cvtepu8_epi16(q5);
+ q256_6 = _mm256_cvtepu8_epi16(q6);
+ q256_7 = _mm256_cvtepu8_epi16(q7);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+ _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+ _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(eight,
+ _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+ _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)), 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)), 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ }
+}
+
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int count) {
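+ // count == 1 selects the 8-pixel-wide path (64-bit loads/stores);
+ // anything else takes the full 16-pixel-wide AVX2 path.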
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
diff --git a/libvpx/vp9/common/x86/vp9_postproc_x86.h b/libvpx/vp9/common/x86/vp9_postproc_x86.h
index b0e8b18..8870215 100644
--- a/libvpx/vp9/common/x86/vp9_postproc_x86.h
+++ b/libvpx/vp9/common/x86/vp9_postproc_x86.h
@@ -61,4 +61,4 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
#endif
#endif
-#endif
+#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
new file mode 100644
index 0000000..9dc8d0a
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
@@ -0,0 +1,987 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after the other taps
+;to avoid overflow.
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
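+;GET_FILTERS_4 broadcasts each tap to four words per quadword, e.g.
+;k0k1 = k0 k0 k0 k0 | k1 k1 k1 k1 (note k5k4 keeps k5 in the low
+;quadword), and krd holds the rounding constant 64 in every word to
+;pair with the psraw 7 at the end of the filter.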
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
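+;APPLY_FILTER_4 accumulates the eight products with saturating adds
+;(paddsw), folding the k2k3/k5k4 terms in last per the note above, then
+;rounds with (sum + 64) >> 7 via krd/psraw before packing to bytes.
+;When the macro argument is 1 the result is averaged with the existing
+;destination pixels (pavgb), which implements the _avg variants.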
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
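+;LOAD_VERT_8 reaches rows 0..7 of the 8-tap window using rax (pitch)
+;and rdx (3 * pitch); rsi is advanced by one row inside the macro,
+;which also slides the window down for the next loop iteration.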
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
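+;APPLY_FILTER_8 is the 8-wide counterpart: one stack constant per tap,
+;saturating accumulation, (sum + 64) >> 7 rounding, and an optional
+;pavgb with the destination when the first argument is 1; the second
+;argument is the byte offset of the 8-pixel group within the row.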
+
+;void vp9_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 1, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index bbf9888..7a5cca0 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -534,6 +534,21 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
ret
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%macro HORIZx4_ROW 2
+ movdqa %2, %1
+ pshufb %1, [GLOBAL(shuf_t0t1)]
+ pshufb %2, [GLOBAL(shuf_t2t3)]
+ pmaddubsw %1, xmm6
+ pmaddubsw %2, xmm7
+
+ paddsw %1, %2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddsw %1, %2
+ paddsw %1, xmm5
+ psraw %1, 7
+ packuswb %1, %1
+%endm
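+;HORIZx4_ROW filters one row of 4 output pixels: xmm6/xmm7 hold the
+;taps packed as k0_k1_k4_k5 and k2_k3_k6_k7, so two pshufb/pmaddubsw
+;pairs cover all eight taps; psrldq 8 folds the k4..k7 half onto the
+;k0..k3 half before the (sum + 64) >> 7 rounding in xmm5.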
%macro HORIZx4 1
mov rdx, arg(5) ;filter ptr
@@ -544,64 +559,84 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movdqa xmm4, [rdx] ;load filters
movq xmm5, rcx
packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
+ pshuflw xmm6, xmm4, 0b ;k0_k1
+ pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
+ pshuflw xmm7, xmm4, 01010101b ;k2_k3
+ pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
+ pshufd xmm5, xmm5, 0 ;rounding
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
-
+ shr rcx, 1
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
- punpcklqdq xmm0, xmm3
+ ;Do two rows at once
+ movq xmm0, [rsi - 3] ;load src
+ movq xmm1, [rsi + 5]
+ movq xmm2, [rsi + rax - 3]
+ movq xmm3, [rsi + rax + 5]
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+
+ HORIZx4_ROW xmm0, xmm1
+ HORIZx4_ROW xmm2, xmm3
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+ movd xmm3, [rdi + rdx]
+ pavgb xmm2, xmm3
+%endif
+ movd [rdi], xmm0
+ movd [rdi + rdx], xmm2
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ lea rsi, [rsi + rax]
+ prefetcht0 [rsi + 4 * rax - 3]
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + 2 * rdx]
+ prefetcht0 [rsi + 2 * rax - 3]
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
+ dec rcx
+ jnz .loop
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ ; Do last row if output_height is odd
+ movsxd rcx, dword ptr arg(4) ;output_height
+ and rcx, 1
+ je .done
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ movq xmm0, [rsi - 3] ; load src
+ movq xmm1, [rsi + 5]
+ punpcklqdq xmm0, xmm1
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
+ HORIZx4_ROW xmm0, xmm1
%if %1
movd xmm1, [rdi]
pavgb xmm0, xmm1
%endif
- lea rsi, [rsi + rax]
movd [rdi], xmm0
+.done:
+%endm
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
+%macro HORIZx8_ROW 4
+ movdqa %2, %1
+ movdqa %3, %1
+ movdqa %4, %1
+
+ pshufb %1, [GLOBAL(shuf_t0t1)]
+ pshufb %2, [GLOBAL(shuf_t2t3)]
+ pshufb %3, [GLOBAL(shuf_t4t5)]
+ pshufb %4, [GLOBAL(shuf_t6t7)]
+
+ pmaddubsw %1, k0k1
+ pmaddubsw %2, k2k3
+ pmaddubsw %3, k4k5
+ pmaddubsw %4, k6k7
+
+ paddsw %1, %2
+ paddsw %1, %4
+ paddsw %1, %3
+ paddsw %1, krd
+ psraw %1, 7
+ packuswb %1, %1
%endm
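+;HORIZx8_ROW filters one row of 8 output pixels using the
+;stack-resident k0k1..k6k7/krd constants; the three scratch arguments
+;let the caller keep two rows in flight per loop iteration.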
%macro HORIZx8 1
@@ -633,45 +668,51 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
+ shr rcx, 1
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
+ movq xmm0, [rsi - 3] ;load src
+ movq xmm3, [rsi + 5]
+ movq xmm4, [rsi + rax - 3]
+ movq xmm7, [rsi + rax + 5]
punpcklqdq xmm0, xmm3
+ punpcklqdq xmm4, xmm7
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+ HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
+%if %1
+ movq xmm1, [rdi]
+ movq xmm2, [rdi + rdx]
+ pavgb xmm0, xmm1
+ pavgb xmm4, xmm2
+%endif
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm4
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
+ lea rsi, [rsi + rax]
+ prefetcht0 [rsi + 4 * rax - 3]
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + 2 * rdx]
+ prefetcht0 [rsi + 2 * rax - 3]
+ dec rcx
+ jnz .loop
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ ;Do last row if output_height is odd
+ movsxd rcx, dword ptr arg(4) ;output_height
+ and rcx, 1
+ je .done
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ movq xmm0, [rsi - 3]
+ movq xmm3, [rsi + 5]
+ punpcklqdq xmm0, xmm3
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
+ HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
movq xmm1, [rdi]
pavgb xmm0, xmm1
%endif
-
- lea rsi, [rsi + rax]
movq [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
+.done:
%endm
%macro HORIZx16 1
@@ -705,60 +746,53 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movsxd rcx, dword ptr arg(4) ;output_height
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+ prefetcht0 [rsi + 2 * rax - 3]
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
- punpcklqdq xmm0, xmm3
+ movq xmm0, [rsi - 3] ;load src data
+ movq xmm4, [rsi + 5]
+ movq xmm7, [rsi + 13]
+ punpcklqdq xmm0, xmm4
+ punpcklqdq xmm4, xmm7
movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
- movdqa xmm2, xmm1
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ pshufb xmm3, [GLOBAL(shuf_t6t7)]
+ pshufb xmm4, [GLOBAL(shuf_t0t1)]
+ pshufb xmm5, [GLOBAL(shuf_t2t3)]
+ pshufb xmm6, [GLOBAL(shuf_t4t5)]
+ pshufb xmm7, [GLOBAL(shuf_t6t7)]
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm1, k2k3
+ pmaddubsw xmm2, k4k5
+ pmaddubsw xmm3, k6k7
+ pmaddubsw xmm4, k0k1
+ pmaddubsw xmm5, k2k3
+ pmaddubsw xmm6, k4k5
+ pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
- paddsw xmm0, xmm4
+ paddsw xmm0, xmm3
paddsw xmm0, xmm2
+ paddsw xmm4, xmm5
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm6
+
paddsw xmm0, krd
+ paddsw xmm4, krd
psraw xmm0, 7
+ psraw xmm4, 7
packuswb xmm0, xmm0
-
-
- movq xmm3, [rsi + 5]
- movq xmm7, [rsi + 13]
- punpcklqdq xmm3, xmm7
-
- movdqa xmm1, xmm3
- pshufb xmm3, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm3, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm3, xmm1
- paddsw xmm3, xmm4
- paddsw xmm3, xmm2
- paddsw xmm3, krd
- psraw xmm3, 7
- packuswb xmm3, xmm3
- punpcklqdq xmm0, xmm3
+ packuswb xmm4, xmm4
+ punpcklqdq xmm0, xmm4
%if %1
movdqa xmm1, [rdi]
pavgb xmm0, xmm1
@@ -792,19 +826,8 @@ sym(vp9_filter_block1d4_h8_ssse3):
push rdi
; end prolog
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
HORIZx4 0
- add rsp, 16*5
- pop rsp
-
; begin epilog
pop rdi
pop rsi
@@ -909,19 +932,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
push rdi
; end prolog
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
HORIZx4 1
- add rsp, 16*5
- pop rsp
-
; begin epilog
pop rdi
pop rsi
diff --git a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
deleted file mode 100644
index 174e747..0000000
--- a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
+++ /dev/null
@@ -1,230 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_add_constant_residual_8x8_neon|
- EXPORT |vp9_add_constant_residual_16x16_neon|
- EXPORT |vp9_add_constant_residual_32x32_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- MACRO
- LD_16x8 $src, $stride
- vld1.8 {q8}, [$src], $stride
- vld1.8 {q9}, [$src], $stride
- vld1.8 {q10}, [$src], $stride
- vld1.8 {q11}, [$src], $stride
- vld1.8 {q12}, [$src], $stride
- vld1.8 {q13}, [$src], $stride
- vld1.8 {q14}, [$src], $stride
- vld1.8 {q15}, [$src], $stride
- MEND
-
- MACRO
- ADD_DIFF_16x8 $diff
- vqadd.u8 q8, q8, $diff
- vqadd.u8 q9, q9, $diff
- vqadd.u8 q10, q10, $diff
- vqadd.u8 q11, q11, $diff
- vqadd.u8 q12, q12, $diff
- vqadd.u8 q13, q13, $diff
- vqadd.u8 q14, q14, $diff
- vqadd.u8 q15, q15, $diff
- MEND
-
- MACRO
- SUB_DIFF_16x8 $diff
- vqsub.u8 q8, q8, $diff
- vqsub.u8 q9, q9, $diff
- vqsub.u8 q10, q10, $diff
- vqsub.u8 q11, q11, $diff
- vqsub.u8 q12, q12, $diff
- vqsub.u8 q13, q13, $diff
- vqsub.u8 q14, q14, $diff
- vqsub.u8 q15, q15, $diff
- MEND
-
- MACRO
- ST_16x8 $dst, $stride
- vst1.8 {q8}, [$dst], $stride
- vst1.8 {q9}, [$dst], $stride
- vst1.8 {q10}, [$dst], $stride
- vst1.8 {q11}, [$dst], $stride
- vst1.8 {q12}, [$dst], $stride
- vst1.8 {q13}, [$dst], $stride
- vst1.8 {q14}, [$dst], $stride
- vst1.8 {q15}, [$dst], $stride
- MEND
-
-; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
-; int width, int height) {
-; int r, c;
-;
-; for (r = 0; r < height; r++) {
-; for (c = 0; c < width; c++)
-; dest[c] = clip_pixel(diff + dest[c]);
-;
-; dest += stride;
-; }
-;}
-;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
-; int stride) {
-; add_constant_residual(diff, dest, stride, 8, 8);
-;}
-; r0 : const int16_t diff
-; r1 : const uint8_t *dest
-; r2 : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_8x8_neon| PROC
- mov r3, r1 ; r3: save dest to r3
- vld1.8 {d0}, [r1], r2
- vld1.8 {d1}, [r1], r2
- vld1.8 {d2}, [r1], r2
- vld1.8 {d3}, [r1], r2
- vld1.8 {d4}, [r1], r2
- vld1.8 {d5}, [r1], r2
- vld1.8 {d6}, [r1], r2
- vld1.8 {d7}, [r1], r2
- cmp r0, #0
- bge DIFF_POSITIVE_8x8
-
-DIFF_NEGATIVE_8x8 ; diff < 0
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q8, r0
-
- vqsub.u8 q0, q0, q8
- vqsub.u8 q1, q1, q8
- vqsub.u8 q2, q2, q8
- vqsub.u8 q3, q3, q8
- b DIFF_SAVE_8x8
-
-DIFF_POSITIVE_8x8 ; diff >= 0
- usat r0, #8, r0
- vdup.u8 q8, r0
-
- vqadd.u8 q0, q0, q8
- vqadd.u8 q1, q1, q8
- vqadd.u8 q2, q2, q8
- vqadd.u8 q3, q3, q8
-
-DIFF_SAVE_8x8
- vst1.8 {d0}, [r3], r2
- vst1.8 {d1}, [r3], r2
- vst1.8 {d2}, [r3], r2
- vst1.8 {d3}, [r3], r2
- vst1.8 {d4}, [r3], r2
- vst1.8 {d5}, [r3], r2
- vst1.8 {d6}, [r3], r2
- vst1.8 {d7}, [r3], r2
-
- bx lr
- ENDP
-
-;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
-; int stride) {
-; add_constant_residual(diff, dest, stride, 16, 16);
-;}
-; r0 : const int16_t diff
-; r1 : const uint8_t *dest
-; r2 : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_16x16_neon| PROC
- mov r3, r1
- LD_16x8 r1, r2
- cmp r0, #0
- bge DIFF_POSITIVE_16x16
-
-|DIFF_NEGATIVE_16x16|
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q0, r0
-
- SUB_DIFF_16x8 q0
- ST_16x8 r3, r2
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- b DIFF_SAVE_16x16
-
-|DIFF_POSITIVE_16x16|
- usat r0, #8, r0
- vdup.u8 q0, r0
-
- ADD_DIFF_16x8 q0
- ST_16x8 r3, r2
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
-
-|DIFF_SAVE_16x16|
- ST_16x8 r3, r2
- bx lr
- ENDP
-
-;void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
-; int stride) {
-; add_constant_residual(diff, dest, stride, 32, 32);
-;}
-; r0 : const int16_t diff
-; r1 : const uint8_t *dest
-; r2 : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_32x32_neon| PROC
- push {r4,lr}
- pld [r1]
- mov r3, r1
- add r4, r1, #16 ; r4 dest + 16 for second loop
- cmp r0, #0
- bge DIFF_POSITIVE_32x32
-
-|DIFF_NEGATIVE_32x32|
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-|DIFF_NEGATIVE_32x32_LOOP|
- sub r0, #1
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r3, r2
-
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r3, r2
- cmp r0, #2
- moveq r1, r4
- moveq r3, r4
- cmp r0, #0
- bne DIFF_NEGATIVE_32x32_LOOP
- pop {r4,pc}
-
-|DIFF_POSITIVE_32x32|
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-|DIFF_POSITIVE_32x32_LOOP|
- sub r0, #1
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r3, r2
-
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r3, r2
- cmp r0, #2
- moveq r1, r4
- moveq r3, r4
- cmp r0, #0
- bne DIFF_POSITIVE_32x32_LOOP
- pop {r4,pc}
- ENDP
-
- END
diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h
index c864516..fd8e74c 100644
--- a/libvpx/vp9/decoder/vp9_dboolhuff.h
+++ b/libvpx/vp9/decoder/vp9_dboolhuff.h
@@ -44,7 +44,7 @@ static int vp9_read(vp9_reader *br, int probability) {
VP9_BD_VALUE bigsplit;
int count;
unsigned int range;
- unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
+ unsigned int split = ((br->range * probability) + (256 - probability)) >> 8;
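+ // Algebraically identical to the previous expression:
+ // range * prob + (256 - prob) == (range - 1) * prob + 256, and adding
+ // 256 before the >> 8 is the same as adding 1 afterwards.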
if (br->count < 0)
vp9_reader_fill(br);
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 84a29b1..9792d2c 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -23,18 +23,36 @@
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_treereader.h"
static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
}
+static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
+ int size_group) {
+ const MB_PREDICTION_MODE y_mode = read_intra_mode(r,
+ cm->fc.y_mode_prob[size_group]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.y_mode[size_group][y_mode];
+ return y_mode;
+}
+
+static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
+ MB_PREDICTION_MODE y_mode) {
+ const MB_PREDICTION_MODE uv_mode = read_intra_mode(r,
+ cm->fc.uv_mode_prob[y_mode]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.uv_mode[y_mode][uv_mode];
+ return uv_mode;
+}
+
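+// Counts are only accumulated when frame-parallel decoding is off:
+// backward probability adaptation consumes them, and it is disabled
+// in that mode.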
static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
uint8_t context) {
- MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[context]);
- ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
+ const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
+ cm->fc.inter_mode_probs[context]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
return mode;
}
@@ -53,33 +71,28 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
tx_size += vp9_read(r, tx_probs[2]);
}
- update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
+ if (!cm->frame_parallel_decoding_mode)
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
return tx_size;
}
-static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode,
- BLOCK_SIZE bsize, int allow_select,
+static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select,
vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
-
- if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
+ if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) {
return read_selected_tx_size(cm, xd, bsize, r);
- else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_32X32)
- return TX_32X32;
- else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_16X16)
- return TX_16X16;
- else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_8X8)
- return TX_8X8;
- else
- return TX_4X4;
+ } else {
+ const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize];
+ const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode];
+ return MIN(max_tx_size_block, max_tx_size_txmode);
+ }
}
static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, int segment_id) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
@@ -91,11 +104,11 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
-static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col,
vp9_reader *r) {
- MACROBLOCKD *const xd = &pbi->mb;
- struct segmentation *const seg = &pbi->common.seg;
- const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
+ struct segmentation *const seg = &cm->seg;
+ const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
int segment_id;
if (!seg->enabled)
@@ -105,16 +118,14 @@ static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
return 0;
segment_id = read_segment_id(r, seg);
- set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id);
+ set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
return segment_id;
}
-static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
int pred_segment_id, segment_id;
if (!seg->enabled)
@@ -138,37 +149,37 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
return segment_id;
}
-static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
- if (!skip_coeff) {
+static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, vp9_reader *r) {
+ if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
const int ctx = vp9_get_pred_context_mbskip(xd);
- skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd));
- cm->counts.mbskip[ctx][skip_coeff]++;
+ const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.mbskip[ctx][skip];
+ return skip;
}
- return skip_coeff;
}
-static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
+static void read_intra_frame_mode_info(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ MODE_INFO *const m,
int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
MB_MODE_INFO *const mbmi = &m->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride];
- const MODE_INFO *left_mi = xd->mi_8x8[-1];
+ const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
- mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r);
- mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
+ mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
+ mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
- const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(m, left_mi, 0) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0);
mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]);
} else {
// Only 4x4, 4x8, 8x4 blocks
@@ -180,8 +191,7 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int ib = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib);
- const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(m, left_mi, ib) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, ib);
const MB_PREDICTION_MODE b_mode = read_intra_mode(r,
vp9_kf_y_mode_prob[A][L]);
m->bmi[ib].as_mode = b_mode;
@@ -200,7 +210,6 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
static int read_mv_component(vp9_reader *r,
const nmv_component *mvcomp, int usehp) {
-
int mag, d, fr, hp;
const int sign = vp9_read(r, mvcomp->sign);
const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
@@ -251,56 +260,10 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
mv->col = ref->col + diff.col;
}
-static void update_mv(vp9_reader *r, vp9_prob *p) {
- if (vp9_read(r, NMV_UPDATE_PROB))
- *p = (vp9_read_literal(r, 7) << 1) | 1;
-}
-
-static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
- int i, j, k;
-
- for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(r, &mvc->joints[j]);
-
- for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
- update_mv(r, &comp->sign);
-
- for (j = 0; j < MV_CLASSES - 1; ++j)
- update_mv(r, &comp->classes[j]);
-
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_mv(r, &comp->class0[j]);
-
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_mv(r, &comp->bits[j]);
- }
-
- for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
- for (j = 0; j < CLASS0_SIZE; ++j)
- for (k = 0; k < 3; ++k)
- update_mv(r, &comp->class0_fp[j][k]);
-
- for (j = 0; j < 3; ++j)
- update_mv(r, &comp->fp[j]);
- }
-
- if (allow_hp) {
- for (i = 0; i < 2; ++i) {
- update_mv(r, &mvc->comps[i].class0_hp);
- update_mv(r, &mvc->comps[i].hp);
- }
- }
-}
-
 // Read the reference frame
-static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
+static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ vp9_reader *r,
int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
@@ -313,7 +276,8 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
if (cm->comp_pred_mode == HYBRID_PREDICTION) {
is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
- counts->comp_inter[comp_ctx][is_comp]++;
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->comp_inter[comp_ctx][is_comp];
} else {
is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
}
@@ -323,18 +287,21 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
- counts->comp_ref[ref_ctx][b]++;
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->comp_ref[ref_ctx][b];
ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
} else {
const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
- ++counts->single_ref[ctx0][0][bit0];
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->single_ref[ctx0][0][bit0];
if (bit0) {
const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
- ++counts->single_ref[ctx1][1][bit1];
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->single_ref[ctx1][1][bit1];
} else {
ref_frame[0] = LAST_FRAME;
}
@@ -344,43 +311,19 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
}
}
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
- int i, j;
- for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
-}
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
- int i, j;
- for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- for (j = 0; j < INTER_MODES - 1; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
-}
-
-static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
- COMPPREDMODE_TYPE mode = vp9_read_bit(r);
- if (mode)
- mode += vp9_read_bit(r);
- return mode;
-}
-
-static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
- VP9D_COMP *pbi, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static INLINE INTERPOLATION_TYPE read_switchable_filter_type(
+ VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
const int type = treed_read(r, vp9_switchable_interp_tree,
cm->fc.switchable_interp_prob[ctx]);
- ++cm->counts.switchable_interp[ctx][type];
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.switchable_interp[ctx][type];
return type;
}
-static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
- vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
+static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
+ vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
@@ -388,9 +331,7 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
mbmi->ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
- const int size_group = size_group_lookup[bsize];
- mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
- cm->counts.y_mode[size_group][mbmi->mode]++;
+ mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
} else {
// Only 4x4, 4x8, 8x4 blocks
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
@@ -400,10 +341,8 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int ib = idy * 2 + idx;
- const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+ const int b_mode = read_intra_mode_y(cm, r, 0);
mi->bmi[ib].as_mode = b_mode;
- cm->counts.y_mode[0][b_mode]++;
-
if (num_4x4_h == 2)
mi->bmi[ib + 2].as_mode = b_mode;
if (num_4x4_w == 2)
@@ -413,55 +352,98 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
mbmi->mode = mi->bmi[3].as_mode;
}
- mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
- cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++;
+ mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
}
-static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+ int_mv mv[2], int_mv best_mv[2],
+ int_mv nearest_mv[2], int_mv near_mv[2],
+ int is_compound, int allow_hp, vp9_reader *r) {
+ int i;
+ int ret = 1;
+
+ switch (mode) {
+ case NEWMV: {
+ nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
+ NULL : &cm->counts.mv;
+ read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+ &cm->fc.nmvc, mv_counts, allow_hp);
+ if (is_compound)
+ read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+ &cm->fc.nmvc, mv_counts, allow_hp);
+ for (i = 0; i < 1 + is_compound; ++i) {
+ ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
+ ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW;
+ }
+ break;
+ }
+ case NEARESTMV: {
+ mv[0].as_int = nearest_mv[0].as_int;
+ if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEARMV: {
+ mv[0].as_int = near_mv[0].as_int;
+ if (is_compound) mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case ZEROMV: {
+ mv[0].as_int = 0;
+ if (is_compound) mv[1].as_int = 0;
+ break;
+ }
+ default: {
+ return 0;
+ }
+ }
+ return ret;
+}
+static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
INTRA_FRAME;
} else {
const int ctx = vp9_get_pred_context_intra_inter(xd);
const int is_inter = vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd));
- ++cm->counts.intra_inter[ctx][is_inter];
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.intra_inter[ctx][is_inter];
return is_inter;
}
}
-static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
+static void read_inter_block_mode_info(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- nmv_context *const nmvc = &cm->fc.nmvc;
MB_MODE_INFO *const mbmi = &mi->mbmi;
- int_mv *const mv0 = &mbmi->mv[0];
- int_mv *const mv1 = &mbmi->mv[1];
const BLOCK_SIZE bsize = mbmi->sb_type;
- const int allow_hp = xd->allow_high_precision_mv;
+ const int allow_hp = cm->allow_high_precision_mv;
- int_mv nearest, nearby, best_mv;
- int_mv nearest_second, nearby_second, best_mv_second;
+ int_mv nearest[2], nearmv[2], best[2];
uint8_t inter_mode_ctx;
MV_REFERENCE_FRAME ref0;
int is_compound;
mbmi->uv_mode = DC_PRED;
- read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame);
+ read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
ref0 = mbmi->ref_frame[0];
is_compound = has_second_ref(mbmi);
- vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
mi_row, mi_col);
inter_mode_ctx = mbmi->mode_context[ref0];
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
- assert(bsize >= BLOCK_8X8);
+ if (bsize < BLOCK_8X8) {
+ vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid usage of segment feature on small blocks");
+ return;
+ }
} else {
if (bsize >= BLOCK_8X8)
mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
@@ -469,222 +451,119 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
// nearest, nearby
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby);
- best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
+ vp9_find_best_ref_mvs(xd, allow_hp,
+ mbmi->ref_mvs[ref0], &nearest[0], &nearmv[0]);
+ best[0].as_int = nearest[0].as_int;
}
if (is_compound) {
const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
- vp9_find_mv_refs(cm, xd, mi, xd->last_mi,
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi,
ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1],
- &nearest_second, &nearby_second);
- best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int;
+ vp9_find_best_ref_mvs(xd, allow_hp,
+ mbmi->ref_mvs[ref1], &nearest[1], &nearmv[1]);
+ best[1].as_int = nearest[1].as_int;
}
}
- mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
- ? read_switchable_filter_type(pbi, r)
- : cm->mcomp_filter_type;
+ mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE)
+ ? read_switchable_filter_type(cm, xd, r)
+ : cm->mcomp_filter_type;
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2
int idx, idy;
+ int b_mode;
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
- int_mv blockmv, secondmv;
+ int_mv block[2];
const int j = idy * 2 + idx;
- const int b_mode = read_inter_mode(cm, r, inter_mode_ctx);
+ b_mode = read_inter_mode(cm, r, inter_mode_ctx);
if (b_mode == NEARESTMV || b_mode == NEARMV) {
- vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0,
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0],
+ &nearmv[0], j, 0,
mi_row, mi_col);
if (is_compound)
- vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second,
- &nearby_second, j, 1,
- mi_row, mi_col);
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1],
+ &nearmv[1], j, 1,
+ mi_row, mi_col);
}
- switch (b_mode) {
- case NEWMV:
- read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
- &cm->counts.mv, allow_hp);
-
- if (is_compound)
- read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
- &cm->counts.mv, allow_hp);
- break;
- case NEARESTMV:
- blockmv.as_int = nearest.as_int;
- if (is_compound)
- secondmv.as_int = nearest_second.as_int;
- break;
- case NEARMV:
- blockmv.as_int = nearby.as_int;
- if (is_compound)
- secondmv.as_int = nearby_second.as_int;
- break;
- case ZEROMV:
- blockmv.as_int = 0;
- if (is_compound)
- secondmv.as_int = 0;
- break;
- default:
- assert(!"Invalid inter mode value");
- }
- mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+ if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+ is_compound, allow_hp, r)) {
+ xd->corrupted |= 1;
+ break;
+      }
+
+ mi->bmi[j].as_mv[0].as_int = block[0].as_int;
if (is_compound)
- mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
+ mi->bmi[j].as_mv[1].as_int = block[1].as_int;
if (num_4x4_h == 2)
mi->bmi[j + 2] = mi->bmi[j];
if (num_4x4_w == 2)
mi->bmi[j + 1] = mi->bmi[j];
- mi->mbmi.mode = b_mode;
}
}
- mv0->as_int = mi->bmi[3].as_mv[0].as_int;
- mv1->as_int = mi->bmi[3].as_mv[1].as_int;
- } else {
- switch (mbmi->mode) {
- case NEARMV:
- mv0->as_int = nearby.as_int;
- if (is_compound)
- mv1->as_int = nearby_second.as_int;
- break;
+ mi->mbmi.mode = b_mode;
- case NEARESTMV:
- mv0->as_int = nearest.as_int;
- if (is_compound)
- mv1->as_int = nearest_second.as_int;
- break;
-
- case ZEROMV:
- mv0->as_int = 0;
- if (is_compound)
- mv1->as_int = 0;
- break;
-
- case NEWMV:
- read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp);
- if (is_compound)
- read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv,
- allow_hp);
- break;
- default:
- assert(!"Invalid inter mode value");
- }
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ } else {
+ xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
+ best, nearest, nearmv,
+ is_compound, allow_hp, r);
}
}
-static void read_inter_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
+static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
- mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r);
- mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
- inter_block = read_is_inter_block(pbi, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type,
+ mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+ mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
+ inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
!mbmi->skip_coeff || !inter_block, r);
if (inter_block)
- read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r);
+ read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
else
- read_intra_block_mode_info(pbi, mi, r);
+ read_intra_block_mode_info(cm, mi, r);
}
-static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
- int i;
-
- cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r)
- : SINGLE_PREDICTION_ONLY;
-
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
- for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
-
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++) {
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
- }
-
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
-}
-
-void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- int k;
-
- // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
- // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
-
- if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
- nmv_context *const nmvc = &pbi->common.fc.nmvc;
- MACROBLOCKD *const xd = &pbi->mb;
- int i, j;
-
- read_inter_mode_probs(&cm->fc, r);
-
- if (cm->mcomp_filter_type == SWITCHABLE)
- read_switchable_interp_probs(&cm->fc, r);
-
- for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
-
- read_comp_pred(cm, r);
-
- for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- for (i = 0; i < INTRA_MODES - 1; ++i)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
-
- for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
- for (i = 0; i < PARTITION_TYPES - 1; ++i)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
-
- read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
- }
-}
-
-void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *mi = xd->this_mi;
+void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col, vp9_reader *r) {
+ MODE_INFO *const mi = xd->mi_8x8[0];
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
int x, y, z;
- if (cm->frame_type == KEY_FRAME || cm->intra_only)
- read_intra_frame_mode_info(pbi, mi, mi_row, mi_col, r);
+ if (frame_is_intra_only(cm))
+ read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r);
else
- read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r);
+ read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
- for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride)
+ for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) {
for (x = !y; x < x_mis; x++) {
- xd->mi_8x8[z + x] = mi;
- }
+ xd->mi_8x8[z + x] = mi;
+ }
+ }
}
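The change running through this whole file gates every counter update behind cm->frame_parallel_decoding_mode: in frame-parallel decoding, backward probability adaptation is disabled, so the counts are never consumed and updating them from concurrent frame threads would only introduce data races. The recurring pattern, in sketch form (names taken from the diff; an illustration that assumes the decoder's headers, not additional libvpx code):

/* Decode one boolean symbol, accumulating adaptation counts only when
 * backward adaptation is actually in use. */
static int read_and_count(VP9_COMMON *cm, vp9_reader *r,
                          vp9_prob prob, unsigned int counts[2]) {
  const int bit = vp9_read(r, prob);
  if (!cm->frame_parallel_decoding_mode)
    ++counts[bit];
  return bit;
}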
diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h
index 462d2e3..8e9ae4a 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/libvpx/vp9/decoder/vp9_decodemv.h
@@ -14,8 +14,10 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r);
+struct TileInfo;
-void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r);
+void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const struct TileInfo *const tile,
+ int mi_row, int mi_col, vp9_reader *r);
#endif // VP9_DECODER_VP9_DECODEMV_H_
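Note the forward declaration: the prototype only takes a pointer to TileInfo, so an incomplete type suffices and the header avoids including TileInfo's definition. A minimal illustration with hypothetical names:

struct Widget;                        /* incomplete type: pointers only */
void frob(const struct Widget *w);    /* compiles without the definition */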
diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c
index dbba28e..4746a3a 100644
--- a/libvpx/vp9/decoder/vp9_decodframe.c
+++ b/libvpx/vp9/decoder/vp9_decodframe.c
@@ -19,6 +19,7 @@
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
@@ -31,16 +32,49 @@
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_idct_blk.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_thread.h"
#include "vp9/decoder/vp9_treereader.h"
+typedef struct TileWorkerData {
+ VP9_COMMON *cm;
+ vp9_reader bit_reader;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+} TileWorkerData;
+
static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
+static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
+ int i;
+ for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
+ if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
+ return 1;
+
+ return 0;
+}
+
+static void setup_compound_prediction(VP9_COMMON *cm) {
+ if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+ } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+ cm->comp_fixed_ref = GOLDEN_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ } else {
+ cm->comp_fixed_ref = LAST_FRAME;
+ cm->comp_var_ref[0] = GOLDEN_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ }
+}
+
// len == 0 is not allowed
static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
return start + len > start && start + len <= end;
@@ -63,18 +97,105 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 3; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 2; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 1; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
+}
+
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+ int i, j;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
+}
+
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+ int i, j;
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ for (j = 0; j < INTER_MODES - 1; ++j)
+ vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+}
+
+static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
+ COMPPREDMODE_TYPE mode = vp9_read_bit(r);
+ if (mode)
+ mode += vp9_read_bit(r);
+ return mode;
+}
+
+static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+ int i;
+
+ const int compound_allowed = is_compound_prediction_allowed(cm);
+ cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r)
+ : SINGLE_PREDICTION_ONLY;
+ if (compound_allowed)
+ setup_compound_prediction(cm);
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+
+ if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+ }
+
+ if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++)
+ vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+}
+
+static void update_mv(vp9_reader *r, vp9_prob *p) {
+ if (vp9_read(r, NMV_UPDATE_PROB))
+ *p = (vp9_read_literal(r, 7) << 1) | 1;
+}
+
+static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
+ int i, j, k;
+
+ for (j = 0; j < MV_JOINTS - 1; ++j)
+ update_mv(r, &mvc->joints[j]);
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp = &mvc->comps[i];
+
+ update_mv(r, &comp->sign);
+
+ for (j = 0; j < MV_CLASSES - 1; ++j)
+ update_mv(r, &comp->classes[j]);
+
+ for (j = 0; j < CLASS0_SIZE - 1; ++j)
+ update_mv(r, &comp->class0[j]);
+
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(r, &comp->bits[j]);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp = &mvc->comps[i];
+
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ for (k = 0; k < 3; ++k)
+ update_mv(r, &comp->class0_fp[j][k]);
+
+ for (j = 0; j < 3; ++j)
+ update_mv(r, &comp->fp[j]);
+ }
+
+ if (allow_hp) {
+ for (i = 0; i < 2; ++i) {
+ update_mv(r, &mvc->comps[i].class0_hp);
+ update_mv(r, &mvc->comps[i].hp);
+ }
+ }
}
static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
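update_mv() stores (literal << 1) | 1, so a 7-bit update encodes exactly the odd probabilities 1, 3, ..., 255, and MV probabilities always keep their low bit set. A standalone check of that mapping (illustration only):

#include <assert.h>

int main(void) {
  int k;
  for (k = 0; k < 128; ++k) {
    const int prob = (k << 1) | 1;  /* the mapping used by update_mv() */
    assert(prob >= 1 && prob <= 255 && (prob & 1) == 1);
  }
  return 0;
}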
@@ -85,47 +206,110 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
xd->plane[i].dequant = cm->uv_dequant[q_index];
}
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+// Allocate storage for each tile column.
+// TODO(jzern): when max_threads <= 1 the same storage could be used for each
+// tile.
+static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
+ VP9_COMMON *const cm = &pbi->common;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ int i, tile_col;
+
+ CHECK_MEM_ERROR(cm, pbi->mi_streams,
+ vpx_realloc(pbi->mi_streams, tile_cols *
+ sizeof(*pbi->mi_streams)));
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
+ pbi->mi_streams[tile_col] =
+ &cm->mi[cm->mi_rows * tile.mi_col_start];
+ }
+
+ // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+ // block where mi unit size is 8x8.
+ CHECK_MEM_ERROR(cm, pbi->above_context[0],
+ vpx_realloc(pbi->above_context[0],
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols));
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ pbi->above_context[i] = pbi->above_context[0] +
+ i * sizeof(*pbi->above_context[0]) *
+ 2 * aligned_mi_cols;
+ }
+
+ // This is sized based on the entire frame. Each tile operates within its
+ // column bounds.
+ CHECK_MEM_ERROR(cm, pbi->above_seg_context,
+ vpx_realloc(pbi->above_seg_context,
+ sizeof(*pbi->above_seg_context) *
+ aligned_mi_cols));
+}
+
+static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int stride = pd->dst.stride;
const int eob = pd->eobs[block];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
- uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, stride);
- switch (tx_size) {
- case TX_4X4: {
- const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
- if (tx_type == DCT_DCT)
- xd->itxm_add(qcoeff, dst, stride, eob);
+ if (eob > 0) {
+ TX_TYPE tx_type;
+ const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+ block);
+ uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
+ pd->dst.buf, stride);
+ switch (tx_size) {
+ case TX_4X4:
+ tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
+ if (tx_type == DCT_DCT)
+ xd->itxm_add(dqcoeff, dst, stride, eob);
+ else
+ vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
+ break;
+ case TX_8X8:
+ tx_type = get_tx_type_8x8(pd->plane_type, xd);
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ tx_type = get_tx_type_16x16(pd->plane_type, xd);
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ tx_type = DCT_DCT;
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(!"Invalid transform size");
+ }
+
+ if (eob == 1) {
+ vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+ } else {
+ if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+ vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
else
- vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob);
- break;
+ vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
- case TX_8X8:
- vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst,
- stride, eob);
- break;
- case TX_16X16:
- vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst,
- stride, eob);
- break;
- case TX_32X32:
- vp9_idct_add_32x32(qcoeff, dst, stride, eob);
- break;
- default:
- assert(!"Invalid transform size");
}
}
-static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+struct intra_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ unsigned char* token_cache;
+};
+
+static void predict_and_reconstruct_intra_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct intra_args *const args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
struct macroblockd_plane *const pd = &xd->plane[plane];
- MODE_INFO *const mi = xd->this_mi;
+ MODE_INFO *const mi = xd->mi_8x8[0];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
block);
uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
@@ -142,32 +326,37 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- if (!mi->mbmi.skip_coeff)
- decode_block(plane, block, plane_bsize, tx_size, arg);
+ if (!mi->mbmi.skip_coeff) {
+ vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+ args->r, args->token_cache);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ }
}
-static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
-
- if (mbmi->skip_coeff) {
- reset_skip_context(xd, bsize);
- return -1;
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
-
- // TODO(dkovalev) if (!vp9_reader_has_error(r))
- return vp9_decode_tokens(pbi, r, bsize);
- }
+struct inter_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ int *eobtotal;
+ unsigned char* token_cache;
+};
+
+static void reconstruct_inter_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct inter_args *args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
+ *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
+ plane_bsize, tx_size,
+ args->r, args->token_cache);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
-static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bh = num_8x8_blocks_high_lookup[bsize];
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int offset = mi_row * cm->mode_info_stride + mi_col;
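The eob-based memsets in inverse_transform_block() above clear only the coefficients the scan order could have populated: 2 values when eob == 1 (DC only), 4 * (4 << tx_size) for small eobs in DCT_DCT blocks up to 16x16, 256 for a 32x32 with eob <= 34, and the full 16 << (tx_size << 1) otherwise. A quick sanity check of those sizes (assuming tx_size runs 0..3 for 4x4..32x32; illustration only):

#include <assert.h>

int main(void) {
  int tx_size;
  for (tx_size = 0; tx_size <= 3; ++tx_size) {
    const int full = 16 << (tx_size << 1);   /* 16, 64, 256, 1024 coeffs */
    const int partial = 4 * (4 << tx_size);  /* 16, 32, 64, 128 coeffs */
    assert(partial <= full);                 /* partial clear never overruns */
  }
  return 0;
}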
@@ -178,178 +367,187 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
// we are using the mode info context stream here
- xd->this_mi =
- xd->mi_8x8[0] = xd->mic_stream_ptr;
- xd->this_mi->mbmi.sb_type = bsize;
- xd->mic_stream_ptr++;
+ xd->mi_8x8[0] = xd->mi_stream;
+ xd->mi_8x8[0]->mbmi.sb_type = bsize;
+ ++xd->mi_stream;
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
- set_skip_context(cm, xd, mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
+ set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
// as they are always compared to values that are in 1/8th pel units
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
- setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
+ setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
}
-static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
- const int ref = mbmi->ref_frame[i] - LAST_FRAME;
- const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]];
- const struct scale_factors *sf = &cm->active_ref_scale[ref];
- if (!vp9_is_valid_scale(sf))
+static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int idx, int mi_row, int mi_col) {
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ const int ref = mbmi->ref_frame[idx] - LAST_FRAME;
+ const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref);
+ const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref];
+ if (!vp9_is_valid_scale(sfc))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
- xd->scale_factor[i] = *sf;
- setup_pre_planes(xd, i, cfg, mi_row, mi_col, sf);
+ xd->scale_factor[idx].sfc = sfc;
+ setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]);
xd->corrupted |= cfg->corrupted;
}
-static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ vp9_reader *r, BLOCK_SIZE bsize,
+ unsigned char *token_cache) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
- if (less8x8)
- if (xd->ab_index > 0)
- return;
-
- set_offsets(pbi, bsize, mi_row, mi_col);
- vp9_read_mode_info(pbi, mi_row, mi_col, r);
+ set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
+ vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
// Has to be called after set_offsets
- mbmi = &xd->this_mi->mbmi;
+ mbmi = &xd->mi_8x8[0]->mbmi;
- if (!is_inter_block(mbmi)) {
- // Intra reconstruction
- decode_tokens(pbi, bsize, r);
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+ if (mbmi->skip_coeff) {
+ reset_skip_context(xd, bsize);
} else {
- // Inter reconstruction
- int eobtotal;
+ if (cm->seg.enabled)
+ setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+ cm->base_qindex));
+ }
- set_ref(pbi, 0, mi_row, mi_col);
+ if (!is_inter_block(mbmi)) {
+ struct intra_args arg = { cm, xd, r, token_cache };
+ foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
+ &arg);
+ } else {
+ // Setup
+ set_ref(cm, xd, 0, mi_row, mi_col);
if (has_second_ref(mbmi))
- set_ref(pbi, 1, mi_row, mi_col);
+ set_ref(cm, xd, 1, mi_row, mi_col);
- vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+ xd->subpix.filter_x = xd->subpix.filter_y =
+ vp9_get_filter_kernel(mbmi->interp_filter);
+
+ // Prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- eobtotal = decode_tokens(pbi, bsize, r);
- if (less8x8) {
- if (eobtotal >= 0)
- foreach_transformed_block(xd, bsize, decode_block, xd);
- } else {
- assert(mbmi->sb_type == bsize);
- if (eobtotal == 0)
- // skip loopfilter
- vp9_set_pred_flag_mbskip(xd, bsize, 1);
- else if (eobtotal > 0)
- foreach_transformed_block(xd, bsize, decode_block, xd);
+
+ // Reconstruction
+ if (!mbmi->skip_coeff) {
+ int eobtotal = 0;
+ struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
+ foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+ if (!less8x8 && eobtotal == 0)
+ mbmi->skip_coeff = 1; // skip loopfilter
}
}
+
xd->corrupted |= vp9_reader_has_error(r);
}
-static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ vp9_reader *r) {
+ const int ctx = partition_plane_context(xd->above_seg_context,
+ xd->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ PARTITION_TYPE p;
+
+ if (has_rows && has_cols)
+ p = treed_read(r, vp9_partition_tree, probs);
+ else if (!has_rows && has_cols)
+ p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+ else if (has_rows && !has_cols)
+ p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+ else
+ p = PARTITION_SPLIT;
+
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.partition[ctx][p];
+
+ return p;
+}
+
+static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ vp9_reader* r, BLOCK_SIZE bsize,
+ unsigned char *token_cache) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
- PARTITION_TYPE partition = PARTITION_NONE;
+ PARTITION_TYPE partition;
BLOCK_SIZE subsize;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (bsize < BLOCK_8X8) {
- if (xd->ab_index != 0)
- return;
- } else {
- int pl;
- const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
-
- if (idx == 0)
- partition = treed_read(r, vp9_partition_tree,
- cm->fc.partition_prob[cm->frame_type][pl]);
- else if (idx > 0 &&
- !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx]))
- partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
- else
- partition = PARTITION_SPLIT;
-
- cm->counts.partition[pl][partition]++;
- }
-
+ partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
- *get_sb_index(xd, subsize) = 0;
-
- switch (partition) {
- case PARTITION_NONE:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- break;
- case PARTITION_HORZ:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- *get_sb_index(xd, subsize) = 1;
- if (mi_row + hbs < cm->mi_rows)
- decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize);
- break;
- case PARTITION_VERT:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- *get_sb_index(xd, subsize) = 1;
- if (mi_col + hbs < cm->mi_cols)
- decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize);
- break;
- case PARTITION_SPLIT: {
- int n;
- for (n = 0; n < 4; n++) {
- const int j = n >> 1, i = n & 1;
- *get_sb_index(xd, subsize) = n;
- decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize);
- }
- } break;
- default:
- assert(!"Invalid partition type");
+ if (subsize < BLOCK_8X8) {
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ break;
+ case PARTITION_HORZ:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ if (mi_row + hbs < cm->mi_rows)
+ decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+ token_cache);
+ break;
+ case PARTITION_VERT:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ if (mi_col + hbs < cm->mi_cols)
+ decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+ token_cache);
+ break;
+ case PARTITION_SPLIT:
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
+ token_cache);
+ break;
+ default:
+ assert(!"Invalid partition type");
+ }
}
// update partition context
if (bsize >= BLOCK_8X8 &&
- (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, subsize, bsize);
- }
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(xd->above_seg_context, xd->left_seg_context,
+ mi_row, mi_col, subsize, bsize);
}
-static void setup_token_decoder(VP9D_COMP *pbi,
- const uint8_t *data, size_t read_size,
+static void setup_token_decoder(const uint8_t *data,
+ const uint8_t *data_end,
+ size_t read_size,
+ struct vpx_internal_error_info *error_info,
vp9_reader *r) {
- VP9_COMMON *cm = &pbi->common;
- const uint8_t *data_end = pbi->source + pbi->source_sz;
-
// Validate the calculated partition length. If the buffer
// described by the partition can't be fully read, then restrict
// it to the portion that can be (for EC mode) or throw an error.
if (!read_is_valid(data, read_size, data_end))
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt tile length");
if (vp9_reader_init(r, data, read_size))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder %d", 1);
}
@@ -364,22 +562,15 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
for (l = 0; l < PREV_COEF_CONTEXTS; l++)
if (k > 0 || l < 3)
for (m = 0; m < UNCONSTRAINED_NODES; m++)
- if (vp9_read(r, VP9_COEF_UPDATE_PROB))
- vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+ vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
}
static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
vp9_reader *r) {
- read_coef_probs_common(fc->coef_probs[TX_4X4], r);
-
- if (tx_mode > ONLY_4X4)
- read_coef_probs_common(fc->coef_probs[TX_8X8], r);
-
- if (tx_mode > ALLOW_8X8)
- read_coef_probs_common(fc->coef_probs[TX_16X16], r);
-
- if (tx_mode > ALLOW_16X16)
- read_coef_probs_common(fc->coef_probs[TX_32X32], r);
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ read_coef_probs_common(fc->coef_probs[tx_size], r);
}
static void setup_segmentation(struct segmentation *seg,
@@ -436,7 +627,6 @@ static void setup_segmentation(struct segmentation *seg,
static void setup_loopfilter(struct loopfilter *lf,
struct vp9_read_bit_buffer *rb) {
-
lf->filter_level = vp9_rb_read_literal(rb, 6);
lf->sharpness_level = vp9_rb_read_literal(rb, 3);
@@ -467,9 +657,8 @@ static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
return old != *delta_q;
}
-static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
- MACROBLOCKD *const xd = &pbi->mb;
- VP9_COMMON *const cm = &pbi->common;
+static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ struct vp9_read_bit_buffer *rb) {
int update = 0;
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
@@ -484,16 +673,15 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c
- : vp9_idct_add;
+ xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
}
-static INTERPOLATIONFILTERTYPE read_interp_filter_type(
- struct vp9_read_bit_buffer *rb) {
- const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
- EIGHTTAP,
- EIGHTTAP_SHARP,
- BILINEAR };
+static INTERPOLATION_TYPE read_interp_filter_type(
+ struct vp9_read_bit_buffer *rb) {
+ const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
+ EIGHTTAP,
+ EIGHTTAP_SHARP,
+ BILINEAR };
return vp9_rb_read_bit(rb) ? SWITCHABLE
: literal_to_type[vp9_rb_read_literal(rb, 2)];
}
@@ -539,7 +727,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
vp9_update_frame_size(cm);
}
- vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height,
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9BORDERINPIXELS);
}
@@ -560,7 +748,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
int found = 0, i;
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
if (vp9_rb_read_bit(rb)) {
- YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]];
+ YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i);
width = cfg->y_crop_width;
height = cfg->y_crop_height;
found = 1;
@@ -579,67 +767,73 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
setup_display_size(cm, rb);
}
-static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
+static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
+ int tile_col) {
+ int i;
+ xd->mi_stream = pbi->mi_streams[tile_col];
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ xd->above_context[i] = pbi->above_context[i];
+ }
+ // see note in alloc_tile_storage().
+ xd->above_seg_context = pbi->above_seg_context;
+}
+
+static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
+ vp9_reader *r) {
const int num_threads = pbi->oxcf.max_threads;
VP9_COMMON *const cm = &pbi->common;
int mi_row, mi_col;
- YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx];
+ MACROBLOCKD *xd = &pbi->mb;
if (pbi->do_loopfilter_inline) {
- if (num_threads > 1) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- lf_data->frame_buffer = fb;
- lf_data->cm = cm;
- lf_data->xd = pbi->mb;
- lf_data->stop = 0;
- lf_data->y_only = 0;
- }
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ lf_data->frame_buffer = get_frame_new_buffer(cm);
+ lf_data->cm = cm;
+ lf_data->xd = pbi->mb;
+ lf_data->stop = 0;
+ lf_data->y_only = 0;
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
- for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
    // For a SB there are 2 left contexts, each pertaining to a MB row within the SB
- vp9_zero(cm->left_context);
- vp9_zero(cm->left_seg_context);
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ vp9_zero(xd->left_context);
+ vp9_zero(xd->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
- decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
+ pbi->token_cache);
if (pbi->do_loopfilter_inline) {
- // delay the loopfilter by 1 macroblock row.
const int lf_start = mi_row - MI_BLOCK_SIZE;
- if (lf_start < 0) continue;
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- if (num_threads > 1) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ // delay the loopfilter by 1 macroblock row.
+ if (lf_start < 0) continue;
- // decoding has completed: finish up the loop filter in this thread.
- if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue;
+ // decoding has completed: finish up the loop filter in this thread.
+ if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue;
- vp9_worker_sync(&pbi->lf_worker);
- lf_data->start = lf_start;
- lf_data->stop = mi_row;
- pbi->lf_worker.hook = vp9_loop_filter_worker;
+ vp9_worker_sync(&pbi->lf_worker);
+ lf_data->start = lf_start;
+ lf_data->stop = mi_row;
+ if (num_threads > 1) {
vp9_worker_launch(&pbi->lf_worker);
} else {
- vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0);
+ vp9_worker_execute(&pbi->lf_worker);
}
}
}
if (pbi->do_loopfilter_inline) {
- int lf_start;
- if (num_threads > 1) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- vp9_worker_sync(&pbi->lf_worker);
- lf_start = lf_data->stop;
- } else {
- lf_start = mi_row - MI_BLOCK_SIZE;
- }
- vp9_loop_filter_rows(fb, cm, &pbi->mb,
- lf_start, cm->mi_rows, 0);
+ vp9_worker_sync(&pbi->lf_worker);
+ lf_data->start = lf_data->stop;
+ lf_data->stop = cm->mi_rows;
+ vp9_worker_execute(&pbi->lf_worker);
}
}
@@ -659,10 +853,32 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->log2_tile_rows += vp9_rb_read_bit(rb);
}
+// Returns the size of the next tile, advancing '*data' past the 4-byte
+// length field; the last tile ('is_last') spans the remainder of the buffer.
+static size_t get_tile(const uint8_t *const data_end,
+ int is_last,
+ struct vpx_internal_error_info *error_info,
+ const uint8_t **data) {
+ size_t size;
+
+ if (!is_last) {
+ if (!read_is_valid(*data, 4, data_end))
+ vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ size = read_be32(*data);
+ *data += 4;
+ } else {
+ size = data_end - *data;
+ }
+ return size;
+}
+
static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
vp9_reader residual_bc;
VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *const data_end = pbi->source + pbi->source_sz;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
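get_tile() above defines the tile stream framing: every tile except the last is preceded by a 4-byte big-endian length, and the last tile runs to the end of the buffer. A self-contained walk over a made-up two-tile buffer (illustration only; read_be32() is copied from earlier in this file):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static int read_be32(const uint8_t *p) {
  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}

int main(void) {
  /* one 3-byte tile with its length prefix, then a final 2-byte tile */
  const uint8_t buf[] = { 0, 0, 0, 3, 0xaa, 0xbb, 0xcc, 0xdd, 0xee };
  const uint8_t *data = buf;
  const uint8_t *const data_end = buf + sizeof(buf);
  size_t size;

  size = (size_t)read_be32(data);    /* not the last tile: explicit length */
  data += 4;
  assert(size == 3);
  data += size;

  size = (size_t)(data_end - data);  /* last tile: whatever bytes remain */
  assert(size == 2);
  return 0;
}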
@@ -672,70 +888,57 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols));
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
if (pbi->oxcf.inv_tile_order) {
const uint8_t *data_ptr2[4][1 << 6];
vp9_reader bc_bak = {0};
- // pre-initialize the offsets, we're going to read in inverse order
+    // pre-initialize the offsets; we're going to decode in inverse order
data_ptr2[0][0] = data;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- if (tile_row) {
- const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]);
- data_ptr2[tile_row - 1][tile_cols - 1] += 4;
- data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size;
- }
-
- for (tile_col = 1; tile_col < tile_cols; tile_col++) {
- const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
- data_ptr2[tile_row][tile_col - 1] += 4;
- data_ptr2[tile_row][tile_col] =
- data_ptr2[tile_row][tile_col - 1] + size;
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ data_ptr2[tile_row][tile_col] = data;
+ data += size;
}
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
- vp9_get_tile_col_offsets(cm, tile_col);
- setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ setup_token_decoder(data_ptr2[tile_row][tile_col], data_end,
data_end - data_ptr2[tile_row][tile_col],
- &residual_bc);
- decode_tile(pbi, &residual_bc);
+ &cm->error, &residual_bc);
+ setup_tile_context(pbi, xd, tile_col);
+ decode_tile(pbi, &tile, &residual_bc);
if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
bc_bak = residual_bc;
}
}
residual_bc = bc_bak;
} else {
- int has_more;
-
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- size_t size;
-
- vp9_get_tile_col_offsets(cm, tile_col);
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ TileInfo tile;
- has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
- if (has_more) {
- if (!read_is_valid(data, 4, data_end))
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet or corrupt tile length");
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
- size = read_be32(data);
- data += 4;
- } else {
- size = data_end - data;
- }
-
- setup_token_decoder(pbi, data, size, &residual_bc);
- decode_tile(pbi, &residual_bc);
+ setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
+ setup_tile_context(pbi, xd, tile_col);
+ decode_tile(pbi, &tile, &residual_bc);
data += size;
}
}
@@ -744,10 +947,113 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
return vp9_reader_find_end(&residual_bc);
}
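
In inverse tile order the decoder first walks the buffer once to record each tile's start, then decodes the columns right-to-left. A sketch of that precompute pass under the same framing (a 4-byte big-endian size precedes every tile except the last); read_be32_sketch() is the helper sketched above, and bounds checks are omitted:

  // Sketch: collect per-tile payload pointers from a size-framed buffer.
  static void collect_tiles(const uint8_t *data, const uint8_t *data_end,
                            int n_tiles, const uint8_t *starts[],
                            size_t sizes[]) {
    int t;
    for (t = 0; t < n_tiles; ++t) {
      if (t < n_tiles - 1) {
        sizes[t] = read_be32_sketch(data);       // framed tile
        data += 4;
      } else {
        sizes[t] = (size_t)(data_end - data);    // last tile: remaining bytes
      }
      starts[t] = data;
      data += sizes[t];
    }
  }
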
+static int tile_worker_hook(void *arg1, void *arg2) {
+ TileWorkerData *tile_data = (TileWorkerData*)arg1;
+ const TileInfo *const tile = (TileInfo*)arg2;
+ int mi_row, mi_col;
+
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
+ decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
+ mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
+ tile_data->token_cache);
+ }
+ }
+ return !tile_data->xd.corrupted;
+}
+
+static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
+ VP9_COMMON *const cm = &pbi->common;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+ int tile_col = 0;
+
+ assert(tile_rows == 1);
+ (void)tile_rows;
+
+ if (num_workers > pbi->num_tile_workers) {
+ int i;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ vpx_realloc(pbi->tile_workers,
+ num_workers * sizeof(*pbi->tile_workers)));
+ for (i = pbi->num_tile_workers; i < num_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ ++pbi->num_tile_workers;
+
+ vp9_worker_init(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ CHECK_MEM_ERROR(cm, worker->data1,
+ vpx_memalign(32, sizeof(TileWorkerData)));
+ CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
+ if (i < num_workers - 1 && !vp9_worker_reset(worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+ }
+ }
+
+ // Note: this memset assumes above_context[0], [1] and [2]
+ // are allocated as part of the same buffer.
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
+
+ while (tile_col < tile_cols) {
+ int i;
+ for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+ TileInfo *const tile = (TileInfo*)worker->data2;
+ const size_t size =
+ get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data);
+
+ tile_data->cm = cm;
+ tile_data->xd = pbi->mb;
+ tile_data->xd.corrupted = 0;
+ vp9_tile_init(tile, tile_data->cm, 0, tile_col);
+
+ setup_token_decoder(data, data_end, size, &cm->error,
+ &tile_data->bit_reader);
+ setup_tile_context(pbi, &tile_data->xd, tile_col);
+
+ worker->had_error = 0;
+ if (i == num_workers - 1 || tile_col == tile_cols - 1) {
+ vp9_worker_execute(worker);
+ } else {
+ vp9_worker_launch(worker);
+ }
+
+ data += size;
+ ++tile_col;
+ }
+
+ for (; i > 0; --i) {
+ VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ pbi->mb.corrupted |= !vp9_worker_sync(worker);
+ }
+ }
+
+ {
+ const int final_worker = (tile_cols + num_workers - 1) % num_workers;
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+ return vp9_reader_find_end(&tile_data->bit_reader);
+ }
+}
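
decode_tiles_mt() deals tiles to workers in rounds of num_workers, running each round's last job on the calling thread (vp9_worker_execute) so no core idles, then syncing the round before reusing the workers. The same pattern in isolation; filling data1/data2 stands in for the per-tile setup above:

  // Sketch of the batch-and-sync scheduling used by decode_tiles_mt().
  static int run_jobs(VP9Worker *workers, int n_workers, int n_jobs) {
    int ok = 1, job = 0;
    while (job < n_jobs) {
      int i;
      for (i = 0; i < n_workers && job < n_jobs; ++i, ++job) {
        workers[i].had_error = 0;
        // ... point workers[i].data1/data2 at this job's inputs ...
        if (i == n_workers - 1 || job == n_jobs - 1)
          vp9_worker_execute(&workers[i]);  // run in the calling thread
        else
          vp9_worker_launch(&workers[i]);   // run in the worker's thread
      }
      for (; i > 0; --i)
        ok &= vp9_worker_sync(&workers[i - 1]);
    }
    return ok;
  }
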
+
static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
- if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 ||
- vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 ||
- vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) {
+ if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
+ vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
+ vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) {
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame sync code");
}
@@ -758,34 +1064,6 @@ static void error_handler(void *data, size_t bit_offset) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
}
-static void setup_inter_inter(VP9_COMMON *cm) {
- int i;
-
- cm->allow_comp_inter_inter = 0;
- for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
- cm->allow_comp_inter_inter |=
- cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1];
-
- if (cm->allow_comp_inter_inter) {
- // which one is always-on in comp inter-inter?
- if (cm->ref_frame_sign_bias[LAST_FRAME] ==
- cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
- cm->comp_fixed_ref = ALTREF_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = GOLDEN_FRAME;
- } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
- cm->ref_frame_sign_bias[ALTREF_FRAME]) {
- cm->comp_fixed_ref = GOLDEN_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = ALTREF_FRAME;
- } else {
- cm->comp_fixed_ref = LAST_FRAME;
- cm->comp_var_ref[0] = GOLDEN_FRAME;
- cm->comp_var_ref[1] = ALTREF_FRAME;
- }
- }
-}
-
#define RESERVED \
if (vp9_rb_read_bit(rb)) \
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \
@@ -794,12 +1072,12 @@ static void setup_inter_inter(VP9_COMMON *cm) {
static size_t read_uncompressed_header(VP9D_COMP *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+ size_t sz;
int i;
cm->last_frame_type = cm->frame_type;
- if (vp9_rb_read_literal(rb, 2) != 0x2)
+ if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame marker");
@@ -820,12 +1098,10 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->error_resilient_mode = vp9_rb_read_bit(rb);
if (cm->frame_type == KEY_FRAME) {
- int csp;
-
check_sync_code(cm, rb);
- csp = vp9_rb_read_literal(rb, 3); // colorspace
- if (csp != 7) { // != sRGB
+ cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace
+ if (cm->color_space != SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
if (cm->version == 1) {
cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -872,13 +1148,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
setup_frame_size_with_refs(pbi, rb);
- xd->allow_high_precision_mv = vp9_rb_read_bit(rb);
+ cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
cm->mcomp_filter_type = read_interp_filter_type(rb);
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
vp9_setup_scale_factors(cm, i);
-
- setup_inter_inter(cm);
}
}
@@ -890,25 +1164,34 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->frame_parallel_decoding_mode = 1;
}
+ // This flag will be overridden by the call to vp9_setup_past_independence
+ // below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2);
- if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only)
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
setup_loopfilter(&cm->lf, rb);
- setup_quantization(pbi, rb);
+ setup_quantization(cm, &pbi->mb, rb);
setup_segmentation(&cm->seg, rb);
setup_tile_info(cm, rb);
+ sz = vp9_rb_read_literal(rb, 16);
+
+ if (sz == 0)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid header size");
- return vp9_rb_read_literal(rb, 16);
+ return sz;
}
static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
size_t partition_size) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
+ FRAME_CONTEXT *const fc = &cm->fc;
vp9_reader r;
+ int k;
if (vp9_reader_init(&r, data, partition_size))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -916,10 +1199,36 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
if (cm->tx_mode == TX_MODE_SELECT)
- read_tx_probs(&cm->fc.tx_probs, &r);
- read_coef_probs(&cm->fc, cm->tx_mode, &r);
+ read_tx_probs(&fc->tx_probs, &r);
+ read_coef_probs(fc, cm->tx_mode, &r);
+
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ vp9_diff_update_prob(&r, &fc->mbskip_probs[k]);
+
+ if (!frame_is_intra_only(cm)) {
+ nmv_context *const nmvc = &fc->nmvc;
+ int i, j;
+
+ read_inter_mode_probs(fc, &r);
+
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ read_switchable_interp_probs(fc, &r);
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
+
+ read_comp_pred(cm, &r);
+
+ for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+ for (i = 0; i < INTRA_MODES - 1; ++i)
+ vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
+
+ for (j = 0; j < PARTITION_CONTEXTS; ++j)
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
- vp9_prepare_read_mode_info(pbi, &r);
+ read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
+ }
return vp9_reader_has_error(&r);
}
@@ -936,59 +1245,109 @@ void vp9_init_dequantizer(VP9_COMMON *cm) {
}
}
+#ifdef NDEBUG
+#define debug_check_frame_counts(cm) (void)0
+#else // !NDEBUG
+// Counts should only be incremented when frame_parallel_decoding_mode and
+// error_resilient_mode are disabled.
+static void debug_check_frame_counts(const VP9_COMMON *const cm) {
+ FRAME_COUNTS zero_counts;
+ vp9_zero(zero_counts);
+ assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode);
+ assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
+ sizeof(cm->counts.y_mode)));
+ assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
+ sizeof(cm->counts.uv_mode)));
+ assert(!memcmp(cm->counts.partition, zero_counts.partition,
+ sizeof(cm->counts.partition)));
+ assert(!memcmp(cm->counts.coef, zero_counts.coef,
+ sizeof(cm->counts.coef)));
+ assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch,
+ sizeof(cm->counts.eob_branch)));
+ assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp,
+ sizeof(cm->counts.switchable_interp)));
+ assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
+ sizeof(cm->counts.inter_mode)));
+ assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
+ sizeof(cm->counts.intra_inter)));
+ assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
+ sizeof(cm->counts.comp_inter)));
+ assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
+ sizeof(cm->counts.single_ref)));
+ assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
+ sizeof(cm->counts.comp_ref)));
+ assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
+ assert(!memcmp(cm->counts.mbskip, zero_counts.mbskip,
+ sizeof(cm->counts.mbskip)));
+ assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
+}
+#endif // NDEBUG
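
The NDEBUG guard compiles the checker out of release builds while leaving call sites untouched: the macro stub replaces every call with a no-op expression. The idiom in isolation:

  #include <assert.h>

  #ifdef NDEBUG
  #define debug_check(invariant) (void)0       // release: calls vanish
  #else   // !NDEBUG
  static void debug_check(int invariant) {     // debug: a real function
    assert(invariant);
  }
  #endif  // NDEBUG
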
+
int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
int i;
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *data = pbi->source;
- const uint8_t *data_end = pbi->source + pbi->source_sz;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
- struct vp9_read_bit_buffer rb = { data, data_end, 0,
- cm, error_handler };
+ struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler };
const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
const int keyframe = cm->frame_type == KEY_FRAME;
- YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx];
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
if (!first_partition_size) {
- // showing a frame directly
- *p_data_end = data + 1;
- return 0;
+ // showing a frame directly
+ *p_data_end = data + 1;
+ return 0;
}
- data += vp9_rb_bytes_read(&rb);
- xd->corrupted = 0;
- new_fb->corrupted = 0;
- pbi->do_loopfilter_inline =
- (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
if (!pbi->decoded_key_frame && !keyframe)
return -1;
+ data += vp9_rb_bytes_read(&rb);
if (!read_is_valid(data, first_partition_size, data_end))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
+ pbi->do_loopfilter_inline =
+ (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
+ if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData)));
+ pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+ if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Loop filter thread creation failed");
+ }
+ }
+
+ alloc_tile_storage(pbi, tile_cols);
xd->mi_8x8 = cm->mi_grid_visible;
- xd->mic_stream_ptr = cm->mi;
xd->mode_info_stride = cm->mode_info_stride;
+ set_prev_mi(cm);
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
-
- vp9_zero(cm->counts);
-
- new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
-
+ setup_plane_dequants(cm, xd, cm->base_qindex);
setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
- // clear out the coeff buffer
+ cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ vp9_zero(cm->counts);
for (i = 0; i < MAX_MB_PLANE; ++i)
- vp9_zero(xd->plane[i].qcoeff);
+ vp9_zero(xd->plane[i].dqcoeff);
- set_prev_mi(cm);
+ xd->corrupted = 0;
+ new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
- *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ // TODO(jzern): remove frame_parallel_decoding_mode restriction for
+ // single-frame tile decoding.
+ if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
+ cm->frame_parallel_decoding_mode) {
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size);
+ } else {
+ *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ }
cm->last_width = cm->width;
cm->last_height = cm->height;
@@ -1006,10 +1365,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(cm);
- if (!keyframe && !cm->intra_only) {
+ if (!frame_is_intra_only(cm)) {
vp9_adapt_mode_probs(cm);
- vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv);
+ vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
}
+ } else {
+ debug_check_frame_counts(cm);
}
if (cm->refresh_frame_context)
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index cd74a0b..b8d670b 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -61,53 +61,55 @@ static const vp9_prob cat6_prob[15] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-#define INCREMENT_COUNT(token) \
- do { \
- coef_counts[type][ref][band][pt] \
- [token >= TWO_TOKEN ? \
- (token == DCT_EOB_TOKEN ? DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
- token]++; \
- token_cache[scan[c]] = vp9_pt_energy_class[token]; \
- } while (0)
+static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
+ ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN,
+ TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN,
+ TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
+};
+
+#define INCREMENT_COUNT(token) \
+ do { \
+ if (!cm->frame_parallel_decoding_mode) { \
+ ++coef_counts[band][pt][token_to_counttoken[token]]; \
+ } \
+ } while (0);
#define WRITE_COEF_CONTINUE(val, token) \
{ \
- qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
+ dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
dq[c > 0] / (1 + (tx_size == TX_32X32)); \
INCREMENT_COUNT(token); \
+ token_cache[scan[c]] = vp9_pt_energy_class[token]; \
c++; \
continue; \
}
-#define ADJUST_COEF(prob, bits_count) \
- do { \
- if (vp9_read(r, prob)) \
- val += 1 << bits_count; \
+#define ADJUST_COEF(prob, bits_count) \
+ do { \
+ val += (vp9_read(r, prob) << bits_count); \
} while (0);
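
Both macro rewrites above are behavior-preserving: token_to_counttoken[] tabulates the old nested ternary (everything from TWO_TOKEN up collapses to TWO_TOKEN, except the EOB token, which maps to DCT_EOB_MODEL_TOKEN), and ADJUST_COEF replaces the branch with a shift of the 0/1 bit read from the stream. A standalone check of both claims; the token values are written as plain integers since the real enum lives in vp9_entropy.h and is assumed here:

  #include <assert.h>

  // Assumed token numbering (mirrors vp9_entropy.h): ZERO_TOKEN=0,
  // ONE_TOKEN=1, TWO_TOKEN=2, ..., DCT_EOB_TOKEN=11,
  // DCT_EOB_MODEL_TOKEN=3, MAX_ENTROPY_TOKENS=12.
  static int old_map(int t) {                 // the removed nested ternary
    return t >= 2 ? (t == 11 ? 3 : 2) : t;
  }

  static const int token_to_counttoken[12] = {
    0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3
  };

  int main(void) {
    int t, bit, shift;
    for (t = 0; t < 12; ++t)                  // table matches the old ternary
      assert(old_map(t) == token_to_counttoken[t]);
    for (bit = 0; bit <= 1; ++bit)            // ADJUST_COEF: branch vs. shift
      for (shift = 0; shift < 14; ++shift) {
        int with_branch = 0, branchless = 0;
        if (bit)
          with_branch += 1 << shift;          // old form
        branchless += bit << shift;           // new form
        assert(with_branch == branchless);
      }
    return 0;
  }
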
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
- PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
- TX_SIZE tx_size, const int16_t *dq,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
- FRAME_CONTEXT *const fc = &cm->fc;
+ PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
+ TX_SIZE tx_size, const int16_t *dq, int pt,
+ uint8_t *token_cache) {
+ const FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
- const int ref = is_inter_block(&xd->this_mi->mbmi);
+ const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
int band, c = 0;
- vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
+ const vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size][type][ref];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } };
- vp9_prob *prob;
- vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
+ const vp9_prob *prob;
+ unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES + 1] =
+ counts->coef[tx_size][type][ref];
+ unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
+ counts->eob_branch[tx_size][type][ref];
const int16_t *scan, *nb;
- const uint8_t *band_translate;
- uint8_t token_cache[1024];
- int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L,
- &scan, &band_translate);
- nb = vp9_get_coef_neighbors_handle(scan);
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ get_scan(xd, tx_size, type, block_idx, &scan, &nb);
while (1) {
int val;
@@ -118,11 +120,12 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
- counts->eob_branch[tx_size][type][ref][band][pt]++;
+ if (!cm->frame_parallel_decoding_mode)
+ ++eob_branch_count[band][pt];
if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
break;
-SKIP_START:
+ SKIP_START:
if (c >= seg_eob)
break;
if (c)
@@ -132,6 +135,7 @@ SKIP_START:
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
+ token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
++c;
goto SKIP_START;
}
@@ -203,47 +207,34 @@ SKIP_START:
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
}
- if (c < seg_eob)
- coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++;
-
+ if (c < seg_eob) {
+ if (!cm->frame_parallel_decoding_mode)
+ ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN];
+ }
return c;
}
-struct decode_block_args {
- VP9D_COMP *pbi;
- vp9_reader *r;
- int *eobtotal;
-};
-
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *argv) {
- const struct decode_block_args* const arg = argv;
-
- // find the maximum eob for this transform size, adjusted by segment
- MACROBLOCKD *xd = &arg->pbi->mb;
- struct segmentation *seg = &arg->pbi->common.seg;
- struct macroblockd_plane* pd = &xd->plane[plane];
- const int segment_id = xd->this_mi->mbmi.segment_id;
- const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
- int aoff, loff, eob;
-
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r,
+ uint8_t *token_cache) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
+ tx_size);
+ int aoff, loff, eob, pt;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
+ pt = get_entropy_context(tx_size, pd->above_context + aoff,
+ pd->left_context + loff);
- eob = decode_coefs(&arg->pbi->common, xd, arg->r, block,
- pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
- tx_size, pd->dequant,
- pd->above_context + aoff, pd->left_context + loff);
+ eob = decode_coefs(cm, xd, r, block,
+ pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_size, pd->dequant, pt, token_cache);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
pd->eobs[block] = eob;
- *arg->eobtotal += eob;
+ return eob;
}
-int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) {
- int eobtotal = 0;
- struct decode_block_args args = {pbi, r, &eobtotal};
- foreach_transformed_block(&pbi->mb, bsize, decode_block, &args);
- return eobtotal;
-}
+
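
With vp9_decode_tokens() removed, callers now walk a block's transform blocks themselves and sum the per-block eob returned by vp9_decode_block_tokens(). A hedged sketch of such a caller; the loop bound and names are hypothetical, and the real iteration lives in vp9_decodframe.c:

  // Sketch: accumulate eobtotal the way the old vp9_decode_tokens() did.
  int eobtotal = 0, block;
  for (block = 0; block < num_blocks; ++block)  // 'num_blocks' hypothetical
    eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize,
                                        tx_size, r, token_cache);
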
diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h
index cf07c56..04939ea 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/libvpx/vp9/decoder/vp9_detokenize.h
@@ -15,6 +15,9 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r,
+ uint8_t *token_cache);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c
index 8cc64f7..fcca017 100644
--- a/libvpx/vp9/decoder/vp9_dsubexp.c
+++ b/libvpx/vp9/decoder/vp9_dsubexp.c
@@ -48,8 +48,6 @@ static int merge_index(int v, int n, int modulus) {
static int inv_remap_prob(int v, int m) {
static int inv_map_table[MAX_PROB - 1] = {
- // generated by:
- // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM);
6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188,
201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26,
@@ -66,10 +64,11 @@ static int inv_remap_prob(int v, int m) {
190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
- 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
-
+ 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252
};
- // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM);
+ // The clamp is not necessary for a conforming VP9 stream; it is added to
+ // prevent out-of-bounds access on bad input data.
+ v = clamp(v, 0, 253);
v = inv_map_table[v];
m--;
if ((m << 1) <= MAX_PROB) {
@@ -101,6 +100,8 @@ static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
}
void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
- int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
- *p = (vp9_prob)inv_remap_prob(delp, *p);
+ if (vp9_read(r, DIFF_UPDATE_PROB)) {
+ const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
+ *p = (vp9_prob)inv_remap_prob(delp, *p);
+ }
}
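
The bound 253 is the table's last valid index: inv_map_table[] is declared with MAX_PROB - 1 entries, and for 8-bit probabilities MAX_PROB is 255, giving indices 0..253. A one-line check of that arithmetic; MAX_PROB's value is an assumption here, defined elsewhere in the codec:

  #include <assert.h>
  #define MAX_PROB 255                    // assumed: 8-bit probability range
  int main(void) {
    assert((MAX_PROB - 1) - 1 == 253);    // last index of inv_map_table[]
    return 0;
  }
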
diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c
deleted file mode 100644
index 395e636..0000000
--- a/libvpx/vp9/decoder/vp9_idct_blk.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_idct_blk.h"
-
-static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
- int width, int height) {
- int r, c;
-
- for (r = 0; r < height; r++) {
- for (c = 0; c < width; c++)
- dest[c] = clip_pixel(diff + dest[c]);
-
- dest += stride;
- }
-}
-
-void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
- int stride) {
- add_constant_residual(diff, dest, stride, 8, 8);
-}
-
-void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
- int stride) {
- add_constant_residual(diff, dest, stride, 16, 16);
-}
-
-void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
- int stride) {
- add_constant_residual(diff, dest, stride, 32, 32);
-}
-
-void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
- int eob) {
- if (tx_type == DCT_DCT) {
- vp9_idct_add(input, dest, stride, eob);
- } else {
- vp9_short_iht4x4_add(input, dest, stride, tx_type);
- vpx_memset(input, 0, 32);
- }
-}
-
-void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob) {
- if (tx_type == DCT_DCT) {
- vp9_idct_add_8x8(input, dest, stride, eob);
- } else {
- if (eob > 0) {
- vp9_short_iht8x8_add(input, dest, stride, tx_type);
- vpx_memset(input, 0, 128);
- }
- }
-}
-
-void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- if (eob > 1) {
- vp9_short_idct4x4_add(input, dest, stride);
- vpx_memset(input, 0, 32);
- } else {
- vp9_short_idct4x4_1_add(input, dest, stride);
- ((int *)input)[0] = 0;
- }
-}
-
-void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
- int eob) {
- if (eob > 1) {
- vp9_short_iwalsh4x4_add(input, dest, stride);
- vpx_memset(input, 0, 32);
- } else {
- vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
- ((int *)input)[0] = 0;
- }
-}
-
-void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- // If dc is 1, then input[0] is the reconstructed value, do not need
- // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-
- // The calculation can be simplified if there are not many non-zero dct
- // coefficients. Use eobs to decide what to do.
- // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
- // Combine that with code here.
- if (eob) {
- if (eob == 1) {
- // DC only DCT coefficient
- vp9_short_idct8x8_1_add(input, dest, stride);
- input[0] = 0;
- } else if (eob <= 10) {
- vp9_short_idct10_8x8_add(input, dest, stride);
- vpx_memset(input, 0, 128);
- } else {
- vp9_short_idct8x8_add(input, dest, stride);
- vpx_memset(input, 0, 128);
- }
- }
-}
-
-void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob) {
- if (tx_type == DCT_DCT) {
- vp9_idct_add_16x16(input, dest, stride, eob);
- } else {
- if (eob > 0) {
- vp9_short_iht16x16_add(input, dest, stride, tx_type);
- vpx_memset(input, 0, 512);
- }
- }
-}
-
-void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- /* The calculation can be simplified if there are not many non-zero dct
- * coefficients. Use eobs to separate different cases. */
- if (eob) {
- if (eob == 1) {
- /* DC only DCT coefficient. */
- vp9_short_idct16x16_1_add(input, dest, stride);
- input[0] = 0;
- } else if (eob <= 10) {
- vp9_short_idct10_16x16_add(input, dest, stride);
- vpx_memset(input, 0, 512);
- } else {
- vp9_short_idct16x16_add(input, dest, stride);
- vpx_memset(input, 0, 512);
- }
- }
-}
-
-void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
-
- if (eob) {
- if (eob == 1) {
- vp9_short_idct1_32x32(input, output);
- vp9_add_constant_residual_32x32(output[0], dest, stride);
- input[0] = 0;
- } else {
- vp9_short_idct32x32_add(input, dest, stride);
- vpx_memset(input, 0, 2048);
- }
- }
-}
-
diff --git a/libvpx/vp9/decoder/vp9_idct_blk.h b/libvpx/vp9/decoder/vp9_idct_blk.h
deleted file mode 100644
index 1810bd0..0000000
--- a/libvpx/vp9/decoder/vp9_idct_blk.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_IDCT_BLK_H_
-#define VP9_DECODER_VP9_IDCT_BLK_H_
-
-#include "vp9/common/vp9_blockd.h"
-
-
-void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride,
- int eob);
-
-void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
- int stride, int eob);
-
-void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
- int stride, int eob);
-
-void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
- int stride, int eob);
-
-#endif // VP9_DECODER_VP9_IDCT_BLK_H_
diff --git a/libvpx/vp9/decoder/vp9_onyxd.h b/libvpx/vp9/decoder/vp9_onyxd.h
index cd5b750..a4b9c24 100644
--- a/libvpx/vp9/decoder/vp9_onyxd.h
+++ b/libvpx/vp9/decoder/vp9_onyxd.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_VP9_ONYXD_H_
-#define VP9_COMMON_VP9_ONYXD_H_
+#ifndef VP9_DECODER_VP9_ONYXD_H_
+#define VP9_DECODER_VP9_ONYXD_H_
#ifdef __cplusplus
extern "C" {
@@ -40,7 +40,7 @@ typedef enum {
void vp9_initialize_dec();
int vp9_receive_compressed_data(VP9D_PTR comp,
- uint64_t size, const uint8_t **dest,
+ size_t size, const uint8_t **dest,
int64_t time_stamp);
int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
@@ -66,4 +66,4 @@ void vp9_remove_decompressor(VP9D_PTR comp);
}
#endif
-#endif // VP9_COMMON_VP9_ONYXD_H_
+#endif // VP9_DECODER_VP9_ONYXD_H_
diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c
index 17d5def..5f970a3 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_if.c
+++ b/libvpx/vp9/decoder/vp9_onyxd_if.c
@@ -65,13 +65,12 @@ static void recon_write_yuv_frame(const char *name,
#endif
#if WRITE_RECON_BUFFER == 2
void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
// write the frame
FILE *yframe;
int i;
char filename[255];
- sprintf(filename, "dx\\y%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->y_height; i++)
@@ -79,7 +78,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->y_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "dx\\u%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -87,7 +86,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->uv_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "dx\\v%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -142,20 +141,13 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
cm->error.setjmp = 0;
pbi->decoded_key_frame = 0;
- if (pbi->oxcf.max_threads > 1) {
- vp9_worker_init(&pbi->lf_worker);
- pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData));
- pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
- if (pbi->lf_worker.data1 == NULL || !vp9_worker_reset(&pbi->lf_worker)) {
- vp9_remove_decompressor(pbi);
- return NULL;
- }
- }
+ vp9_worker_init(&pbi->lf_worker);
return pbi;
}
void vp9_remove_decompressor(VP9D_PTR ptr) {
+ int i;
VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
if (!pbi)
@@ -164,6 +156,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
vp9_remove_common(&pbi->common);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ vp9_worker_end(worker);
+ vpx_free(worker->data1);
+ vpx_free(worker->data2);
+ }
+ vpx_free(pbi->tile_workers);
+ vpx_free(pbi->mi_streams);
+ vpx_free(pbi->above_context[0]);
+ vpx_free(pbi->above_seg_context);
vpx_free(pbi);
}
@@ -177,7 +179,6 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
YV12_BUFFER_CONFIG *sd) {
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
- int ref_fb_idx;
/* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
* encoder is using the frame buffers for. This is just a stub to keep the
@@ -185,18 +186,15 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
* later commit that adds VP9-specific controls for this functionality.
*/
if (ref_frame_flag == VP9_LAST_FLAG) {
- ref_fb_idx = cm->ref_frame_map[0];
+ YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]];
+ if (!equal_dimensions(cfg, sd))
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ vp8_yv12_copy_frame(cfg, sd);
} else {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Invalid reference frame");
- return cm->error.error_code;
- }
-
- if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) {
- vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
- "Incorrect buffer dimensions");
- } else {
- vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
}
return cm->error.error_code;
@@ -214,13 +212,13 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
* vpxenc --test-decode functionality working, and will be replaced in a
* later commit that adds VP9-specific controls for this functionality.
*/
- if (ref_frame_flag == VP9_LAST_FLAG)
+ if (ref_frame_flag == VP9_LAST_FLAG) {
ref_fb_ptr = &pbi->common.active_ref_idx[0];
- else if (ref_frame_flag == VP9_GOLD_FLAG)
+ } else if (ref_frame_flag == VP9_GOLD_FLAG) {
ref_fb_ptr = &pbi->common.active_ref_idx[1];
- else if (ref_frame_flag == VP9_ALT_FLAG)
+ } else if (ref_frame_flag == VP9_ALT_FLAG) {
ref_fb_ptr = &pbi->common.active_ref_idx[2];
- else {
+ } else {
vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
"Invalid reference frame");
return pbi->common.error.error_code;
@@ -268,7 +266,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) {
++ref_index;
}
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+ cm->frame_to_show = get_frame_new_buffer(cm);
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
// Invalidate these references until the next frame starts.
@@ -277,7 +275,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) {
}
int vp9_receive_compressed_data(VP9D_PTR ptr,
- uint64_t size, const uint8_t **psource,
+ size_t size, const uint8_t **psource,
int64_t time_stamp) {
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
@@ -306,7 +304,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
* thing to do here.
*/
if (cm->active_ref_idx[0] != INT_MAX)
- cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
+ get_frame_ref_buffer(cm, 0)->corrupted = 1;
}
cm->new_fb_idx = get_free_fb(cm);
@@ -323,7 +321,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
* thing to do here.
*/
if (cm->active_ref_idx[0] != INT_MAX)
- cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
+ get_frame_ref_buffer(cm, 0)->corrupted = 1;
if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
@@ -343,36 +341,33 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
return retcode;
}
- {
- swap_frame_buffers(pbi);
+ swap_frame_buffers(pbi);
#if WRITE_RECON_BUFFER == 2
- if (cm->show_frame)
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame);
- else
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 1000);
+ if (cm->show_frame)
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame);
+ else
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 1000);
#endif
- if (!pbi->do_loopfilter_inline) {
- /* Apply the loop filter if appropriate. */
- vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
- }
+ if (!pbi->do_loopfilter_inline) {
+ vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
+ }
#if WRITE_RECON_BUFFER == 2
- if (cm->show_frame)
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 2000);
- else
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 3000);
+ if (cm->show_frame)
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 2000);
+ else
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 3000);
#endif
- vp9_extend_frame_inner_borders(cm->frame_to_show,
- cm->subsampling_x,
- cm->subsampling_y);
- }
+ vp9_extend_frame_inner_borders(cm->frame_to_show,
+ cm->subsampling_x,
+ cm->subsampling_y);
#if WRITE_RECON_BUFFER == 1
if (cm->show_frame)
@@ -398,6 +393,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
+ pbi->mb.mi_8x8 = cm->mi_grid_visible;
+ pbi->mb.mi_8x8[0] = cm->mi;
+
cm->current_video_frame++;
}
diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h
index a051971..7c4c9db 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_int.h
+++ b/libvpx/vp9/decoder/vp9_onyxd_int.h
@@ -25,7 +25,7 @@ typedef struct VP9Decompressor {
VP9D_CONFIG oxcf;
const uint8_t *source;
- uint32_t source_sz;
+ size_t source_sz;
int64_t last_time_stamp;
int ready_for_new_data;
@@ -39,6 +39,18 @@ typedef struct VP9Decompressor {
int do_loopfilter_inline; // apply loopfilter to available rows immediately
VP9Worker lf_worker;
+
+ VP9Worker *tile_workers;
+ int num_tile_workers;
+
+ /* Each tile column has its own MODE_INFO stream. This array indexes them by
+ tile column index. */
+ MODE_INFO **mi_streams;
+
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ PARTITION_CONTEXT *above_seg_context;
+
+ DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
} VP9D_COMP;
-#endif // VP9_DECODER_VP9_TREEREADER_H_
+#endif // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
index c7fa3aa..41a6868 100644
--- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h
+++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_READ_BIT_BUFFER_
-#define VP9_READ_BIT_BUFFER_
+#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
+#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
#include <limits.h>
@@ -57,4 +57,4 @@ static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
return vp9_rb_read_bit(rb) ? -value : value;
}
-#endif // VP9_READ_BIT_BUFFER_
+#endif // VP9_DECODER_VP9_READ_BIT_BUFFER_H_
diff --git a/libvpx/vp9/decoder/vp9_thread.c b/libvpx/vp9/decoder/vp9_thread.c
index dc3b681..d953e72 100644
--- a/libvpx/vp9/decoder/vp9_thread.c
+++ b/libvpx/vp9/decoder/vp9_thread.c
@@ -29,7 +29,7 @@ extern "C" {
//------------------------------------------------------------------------------
// simplistic pthread emulation layer
-#include <process.h>
+#include <process.h> // NOLINT
// _beginthreadex requires __stdcall
#define THREADFN unsigned int __stdcall
@@ -145,9 +145,7 @@ static THREADFN thread_loop(void *ptr) { // thread loop
pthread_cond_wait(&worker->condition_, &worker->mutex_);
}
if (worker->status_ == WORK) {
- if (worker->hook) {
- worker->had_error |= !worker->hook(worker->data1, worker->data2);
- }
+ vp9_worker_execute(worker);
worker->status_ = OK;
} else if (worker->status_ == NOT_OK) { // finish the worker
done = 1;
@@ -178,7 +176,7 @@ static void change_state(VP9Worker* const worker,
pthread_mutex_unlock(&worker->mutex_);
}
-#endif
+#endif // CONFIG_MULTITHREAD
//------------------------------------------------------------------------------
@@ -218,12 +216,17 @@ int vp9_worker_reset(VP9Worker* const worker) {
return ok;
}
+void vp9_worker_execute(VP9Worker* const worker) {
+ if (worker->hook != NULL) {
+ worker->had_error |= !worker->hook(worker->data1, worker->data2);
+ }
+}
+
void vp9_worker_launch(VP9Worker* const worker) {
#if CONFIG_MULTITHREAD
change_state(worker, WORK);
#else
- if (worker->hook)
- worker->had_error |= !worker->hook(worker->data1, worker->data2);
+ vp9_worker_execute(worker);
#endif
}
diff --git a/libvpx/vp9/decoder/vp9_thread.h b/libvpx/vp9/decoder/vp9_thread.h
index a8f7e04..a624f3c 100644
--- a/libvpx/vp9/decoder/vp9_thread.h
+++ b/libvpx/vp9/decoder/vp9_thread.h
@@ -17,7 +17,7 @@
#ifndef VP9_DECODER_VP9_THREAD_H_
#define VP9_DECODER_VP9_THREAD_H_
-#include "vpx_config.h"
+#include "./vpx_config.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -27,7 +27,7 @@ extern "C" {
#if defined(_WIN32)
-#include <windows.h>
+#include <windows.h> // NOLINT
typedef HANDLE pthread_t;
typedef CRITICAL_SECTION pthread_mutex_t;
typedef struct {
@@ -38,7 +38,7 @@ typedef struct {
#else
-#include <pthread.h>
+#include <pthread.h> // NOLINT
#endif /* _WIN32 */
#endif /* CONFIG_MULTITHREAD */
@@ -80,6 +80,11 @@ int vp9_worker_sync(VP9Worker* const worker);
// hook/data1/data2 can be changed at any time before calling this function,
// but not be changed afterward until the next call to vp9_worker_sync().
void vp9_worker_launch(VP9Worker* const worker);
+// This function is similar to vp9_worker_launch() except that it calls the
+// hook directly instead of using a thread, which is convenient for bypassing
+// the thread mechanism while still using the VP9Worker structs.
+// vp9_worker_sync() must still be called afterward (for error reporting).
+void vp9_worker_execute(VP9Worker* const worker);
// Kill the thread and terminate the object. To use the object again, one
// must call vp9_worker_reset() again.
void vp9_worker_end(VP9Worker* const worker);
@@ -90,4 +95,4 @@ void vp9_worker_end(VP9Worker* const worker);
} // extern "C"
#endif
-#endif /* VP9_DECODER_VP9_THREAD_H_ */
+#endif // VP9_DECODER_VP9_THREAD_H_
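
A minimal usage sketch of the synchronous path; my_hook is a hypothetical stand-in, and a real hook returns nonzero on success, as tile_worker_hook does above:

  static int my_hook(void *data1, void *data2) {
    (void)data1; (void)data2;
    return 1;                           // hypothetical: report success
  }

  static int run_synchronously(void) {
    VP9Worker worker;
    vp9_worker_init(&worker);
    worker.hook = my_hook;
    worker.had_error = 0;
    vp9_worker_execute(&worker);        // hook runs on this thread
    return vp9_worker_sync(&worker);    // still required for error reporting
  }
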
diff --git a/libvpx/vp9/decoder/vp9_treereader.h b/libvpx/vp9/decoder/vp9_treereader.h
index 710cc4c..f612497 100644
--- a/libvpx/vp9/decoder/vp9_treereader.h
+++ b/libvpx/vp9/decoder/vp9_treereader.h
@@ -23,7 +23,8 @@ static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
const vp9_prob *const p) {
register vp9_tree_index i = 0;
- while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
+ while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0)
+ continue;
return -i;
}
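
treed_read() walks a vp9_tree stored as an array of index pairs: a read bit selects the left or right entry of the current pair, positive entries point at further pairs, and non-positive entries are negated leaf tokens. A self-contained toy walk over a three-symbol tree with scripted bits; the tree encoding mirrors the convention above, and the bit source is a stand-in for vp9_read():

  #include <assert.h>

  typedef signed char tree_index;

  // Toy 3-symbol tree: node 0 -> {leaf A, node 2}, node 2 -> {leaf B, leaf C}.
  static const tree_index tree[4] = { -0, 2, -1, -2 };  // tokens A=0, B=1, C=2

  static int next_bit(const int **bits) { return *(*bits)++; }  // scripted

  static int toy_treed_read(const int **bits) {
    tree_index i = 0;
    while ((i = tree[i + next_bit(bits)]) > 0)
      continue;
    return -i;
  }

  int main(void) {
    const int b0[] = { 0 }, b1[] = { 1, 0 }, b2[] = { 1, 1 };
    const int *p;
    p = b0; assert(toy_treed_read(&p) == 0);  // bit 0    -> token A
    p = b1; assert(toy_treed_read(&p) == 1);  // bits 1,0 -> token B
    p = b2; assert(toy_treed_read(&p) == 2);  // bits 1,1 -> token C
    return 0;
  }
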
diff --git a/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c b/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c
deleted file mode 100644
index 54ec67f..0000000
--- a/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
-
-void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
- int stride) {
- uint8_t abs_diff;
- __m128i d;
-
- // Prediction data.
- __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
- __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
- __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
- __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
- __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
- __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
- __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
-
- p0 = _mm_unpacklo_epi64(p0, p1);
- p2 = _mm_unpacklo_epi64(p2, p3);
- p4 = _mm_unpacklo_epi64(p4, p5);
- p6 = _mm_unpacklo_epi64(p6, p7);
-
- // Clip diff value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (diff >= 0) {
- abs_diff = (diff > 255) ? 255 : diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_adds_epu8(p0, d);
- p2 = _mm_adds_epu8(p2, d);
- p4 = _mm_adds_epu8(p4, d);
- p6 = _mm_adds_epu8(p6, d);
- } else {
- abs_diff = (diff < -255) ? 255 : -diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_subs_epu8(p0, d);
- p2 = _mm_subs_epu8(p2, d);
- p4 = _mm_subs_epu8(p4, d);
- p6 = _mm_subs_epu8(p6, d);
- }
-
- _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
- _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
- p2 = _mm_srli_si128(p2, 8);
- _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
- _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
- p4 = _mm_srli_si128(p4, 8);
- _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
- _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
- p6 = _mm_srli_si128(p6, 8);
- _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
-void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest,
- int stride) {
- uint8_t abs_diff;
- __m128i d;
-
- // Prediction data.
- __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
- __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride));
- __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride));
- __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride));
- __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride));
- __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride));
- __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride));
- __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride));
- __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride));
- __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride));
- __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride));
- __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride));
- __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride));
-
- // Clip diff value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (diff >= 0) {
- abs_diff = (diff > 255) ? 255 : diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_adds_epu8(p0, d);
- p1 = _mm_adds_epu8(p1, d);
- p2 = _mm_adds_epu8(p2, d);
- p3 = _mm_adds_epu8(p3, d);
- p4 = _mm_adds_epu8(p4, d);
- p5 = _mm_adds_epu8(p5, d);
- p6 = _mm_adds_epu8(p6, d);
- p7 = _mm_adds_epu8(p7, d);
- p8 = _mm_adds_epu8(p8, d);
- p9 = _mm_adds_epu8(p9, d);
- p10 = _mm_adds_epu8(p10, d);
- p11 = _mm_adds_epu8(p11, d);
- p12 = _mm_adds_epu8(p12, d);
- p13 = _mm_adds_epu8(p13, d);
- p14 = _mm_adds_epu8(p14, d);
- p15 = _mm_adds_epu8(p15, d);
- } else {
- abs_diff = (diff < -255) ? 255 : -diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_subs_epu8(p0, d);
- p1 = _mm_subs_epu8(p1, d);
- p2 = _mm_subs_epu8(p2, d);
- p3 = _mm_subs_epu8(p3, d);
- p4 = _mm_subs_epu8(p4, d);
- p5 = _mm_subs_epu8(p5, d);
- p6 = _mm_subs_epu8(p6, d);
- p7 = _mm_subs_epu8(p7, d);
- p8 = _mm_subs_epu8(p8, d);
- p9 = _mm_subs_epu8(p9, d);
- p10 = _mm_subs_epu8(p10, d);
- p11 = _mm_subs_epu8(p11, d);
- p12 = _mm_subs_epu8(p12, d);
- p13 = _mm_subs_epu8(p13, d);
- p14 = _mm_subs_epu8(p14, d);
- p15 = _mm_subs_epu8(p15, d);
- }
-
- // Store results
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
- _mm_store_si128((__m128i *)(dest + 4 * stride), p4);
- _mm_store_si128((__m128i *)(dest + 5 * stride), p5);
- _mm_store_si128((__m128i *)(dest + 6 * stride), p6);
- _mm_store_si128((__m128i *)(dest + 7 * stride), p7);
- _mm_store_si128((__m128i *)(dest + 8 * stride), p8);
- _mm_store_si128((__m128i *)(dest + 9 * stride), p9);
- _mm_store_si128((__m128i *)(dest + 10 * stride), p10);
- _mm_store_si128((__m128i *)(dest + 11 * stride), p11);
- _mm_store_si128((__m128i *)(dest + 12 * stride), p12);
- _mm_store_si128((__m128i *)(dest + 13 * stride), p13);
- _mm_store_si128((__m128i *)(dest + 14 * stride), p14);
- _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
-}
-
-void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
- int stride) {
- uint8_t abs_diff;
- __m128i d;
- int i = 8;
-
- if (diff >= 0) {
- abs_diff = (diff > 255) ? 255 : diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
- } else {
- abs_diff = (diff < -255) ? 255 : -diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
- }
-
- do {
- // Prediction data.
- __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
- __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
- __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
- __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
- __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));
-
- // Clip diff value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (diff >= 0) {
- p0 = _mm_adds_epu8(p0, d);
- p1 = _mm_adds_epu8(p1, d);
- p2 = _mm_adds_epu8(p2, d);
- p3 = _mm_adds_epu8(p3, d);
- p4 = _mm_adds_epu8(p4, d);
- p5 = _mm_adds_epu8(p5, d);
- p6 = _mm_adds_epu8(p6, d);
- p7 = _mm_adds_epu8(p7, d);
- } else {
- p0 = _mm_subs_epu8(p0, d);
- p1 = _mm_subs_epu8(p1, d);
- p2 = _mm_subs_epu8(p2, d);
- p3 = _mm_subs_epu8(p3, d);
- p4 = _mm_subs_epu8(p4, d);
- p5 = _mm_subs_epu8(p5, d);
- p6 = _mm_subs_epu8(p6, d);
- p7 = _mm_subs_epu8(p7, d);
- }
-
- // Store results
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
- _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
- _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);
-
- dest += 4 * stride;
- } while (--i);
-}
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 957cfd2..87bd36c 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -22,7 +22,6 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_treecoder.h"
#include "vp9/common/vp9_systemdependent.h"
@@ -54,8 +53,7 @@ extern unsigned int active_section;
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
void init_tx_count_stats() {
vp9_zero(tx_count_32x32p_stats);
@@ -88,10 +86,9 @@ static void update_tx_count_stats(VP9_COMMON *cm) {
static void update_switchable_interp_stats(VP9_COMMON *cm) {
int i, j;
- for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
- for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ for (j = 0; j < SWITCHABLE_FILTERS; ++j)
switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
- }
}
void write_tx_count_stats() {
@@ -141,9 +138,9 @@ void write_switchable_interp_stats() {
fclose(fp);
printf(
- "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+ "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
"[SWITCHABLE_FILTERS] = {\n");
- for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
printf(" { ");
for (j = 0; j < SWITCHABLE_FILTERS; j++) {
printf("%"PRId64", ", switchable_interp_stats[i][j]);
@@ -166,34 +163,27 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-static void update_mode(
- vp9_writer *w,
- int n,
- vp9_tree tree,
- vp9_prob Pnew[/* n-1 */],
- vp9_prob Pcur[/* n-1 */],
- unsigned int bct[/* n-1 */] [2],
- const unsigned int num_events[/* n */]
-) {
+static void update_mode(vp9_writer *w, int n, vp9_tree tree,
+ vp9_prob Pcur[/* n-1 */],
+ unsigned int bct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
int i = 0;
- vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
+ vp9_tree_probs_from_distribution(tree, bct, num_events, 0);
n--;
- for (i = 0; i < n; ++i) {
- vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]);
- }
+ for (i = 0; i < n; ++i)
+ vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
}
static void update_mbintra_mode_probs(VP9_COMP* const cpi,
vp9_writer* const bc) {
VP9_COMMON *const cm = &cpi->common;
int j;
- vp9_prob pnew[INTRA_MODES - 1];
unsigned int bct[INTRA_MODES - 1][2];
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
+ update_mode(bc, INTRA_MODES, vp9_intra_mode_tree,
cm->fc.y_mode_prob[j], bct,
(unsigned int *)cpi->y_mode_count[j]);
}
@@ -228,53 +218,43 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
int k;
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
- MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+ vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
}
static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
}
-static void update_switchable_interp_probs(VP9_COMP *const cpi,
- vp9_writer* const bc) {
+static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
+ unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2];
int i, j;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
- vp9_tree_probs_from_distribution(
- vp9_switchable_interp_tree,
- new_prob[j], branch_ct[j],
- cm->counts.switchable_interp[j], 0);
- }
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
- MODE_UPDATE_PROB, branch_ct[j][i]);
- }
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
+ vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct,
+ cm->counts.switchable_interp[j], 0);
+
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i],
+ branch_ct[i]);
}
+
#ifdef MODE_STATS
if (!cpi->dummy_packing)
update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
unsigned int branch_ct[INTER_MODES - 1][2];
- vp9_prob new_prob[INTER_MODES - 1];
-
- vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
- new_prob, branch_ct,
+ vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct,
cm->counts.inter_mode[i], NEARESTMV);
for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
- MODE_UPDATE_PROB, branch_ct[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j],
+ branch_ct[j]);
}
}
@@ -283,7 +263,7 @@ static void pack_mb_tokens(vp9_writer* const bc,
const TOKENEXTRA *const stop) {
TOKENEXTRA *p = *tp;
- while (p < stop) {
+ while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
const struct vp9_token *const a = vp9_coef_encodings + t;
const vp9_extra_bit *const b = vp9_extra_bits + t;
@@ -293,10 +273,6 @@ static void pack_mb_tokens(vp9_writer* const bc,
int n = a->len;
vp9_prob probs[ENTROPY_NODES];
- if (t == EOSB_TOKEN) {
- ++p;
- break;
- }
if (t >= TWO_TOKEN) {
vp9_model_to_full_probs(p->context_tree, probs);
pp = probs;
@@ -338,14 +314,14 @@ static void pack_mb_tokens(vp9_writer* const bc,
++p;
}
- *tp = p;
+ *tp = p + (p->token == EOSB_TOKEN);
}
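
pack_mb_tokens() now treats EOSB_TOKEN purely as a loop sentinel: the while condition stops on it, and the closing *tp = p + (p->token == EOSB_TOKEN); steps over the sentinel only when one was actually hit, since the stop bound can also end the loop. A self-contained illustration of the same pattern (the guard at the end is added here for safety; the real code relies on the token buffer's sentinel layout):

    #include <stdio.h>

    int main(void) {
      int tokens[] = { 3, 1, 4, -1 /* sentinel, like EOSB_TOKEN */, 9, 9 };
      int *p = tokens, *stop = tokens + 6;
      while (p < stop && *p != -1)     /* consume until sentinel or hard end */
        printf("%d ", *p++);
      if (p < stop)                    /* guard the read at the hard end */
        p += (*p == -1);               /* skip the sentinel, if present */
      printf("\nresume at index %d\n", (int)(p - tokens));
      return 0;
    }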
static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
const vp9_prob *p) {
assert(is_inter_mode(mode));
write_token(w, vp9_inter_mode_tree, p,
- &vp9_inter_mode_encodings[mode - NEARESTMV]);
+ &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
}
@@ -360,7 +336,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi;
const int segment_id = mi->segment_id;
int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
@@ -393,8 +369,8 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
mi->ref_frame[0]);
}
- // if using the prediction mdoel we have nothing further to do because
- // the reference frame is fully coded by the segment
+ // If using the prediction model we have nothing further to do because
+ // the reference frame is fully coded by the segment.
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -409,9 +385,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
const int segment_id = mi->segment_id;
int skip_coeff;
const BLOCK_SIZE bsize = mi->sb_type;
- const int allow_hp = xd->allow_high_precision_mv;
-
- x->partition_info = x->pi + (m - cm->mi);
+ const int allow_hp = cm->allow_high_precision_mv;
#ifdef ENTROPY_STATS
active_section = 9;
@@ -488,17 +462,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
}
if (bsize < BLOCK_8X8) {
- int j;
- MB_PREDICTION_MODE blockmode;
- int_mv blockmv;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
- j = idy * 2 + idx;
- blockmode = x->partition_info->bmi[j].mode;
- blockmv = m->bmi[j].as_mv[0];
+ const int j = idy * 2 + idx;
+ const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
write_sb_mv_ref(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(blockmode)];
@@ -507,14 +477,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
#ifdef ENTROPY_STATS
active_section = 11;
#endif
- vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv,
- nmvc, allow_hp);
-
- if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(cpi, bc,
- &m->bmi[j].as_mv[1].as_mv,
- &mi->best_second_mv.as_mv,
- nmvc, allow_hp);
+ vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
+ &mi->best_mv[0].as_mv, nmvc, allow_hp);
+
+ if (has_second_ref(mi))
+ vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
+ &mi->best_mv[1].as_mv, nmvc, allow_hp);
}
}
}
@@ -522,12 +490,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
#ifdef ENTROPY_STATS
active_section = 5;
#endif
- vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv,
- nmvc, allow_hp);
+ vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
+ &mi->best_mv[0].as_mv, nmvc, allow_hp);
- if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
- nmvc, allow_hp);
+ if (has_second_ref(mi))
+ vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
+ &mi->best_mv[1].as_mv, nmvc, allow_hp);
}
}
}
@@ -541,7 +509,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int ym = m->mbmi.mode;
const int segment_id = m->mbmi.segment_id;
MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride];
- MODE_INFO *left_mi = mi_8x8[-1];
+ MODE_INFO *left_mi = xd->left_available ? mi_8x8[-1] : NULL;
if (seg->update_map)
write_segment_id(bc, seg, m->mbmi.segment_id);
@@ -553,8 +521,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
if (m->mbmi.sb_type >= BLOCK_8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
- const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(m, left_mi, 0) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0);
write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]);
} else {
int idx, idy;
@@ -564,8 +531,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
int i = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
- const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(m, left_mi, i) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i);
const int bm = m->bmi[i].as_mode;
#ifdef ENTROPY_STATS
++intra_mode_stats[A][L][bm];
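
The left_mi change above moves the availability check out of the call sites: the first column now passes left_mi == NULL and lets the neighbor-mode helper supply the DC_PRED default itself, which is also why the sub-8x8 (xd->left_available || idx) special case disappears. A sketch of the helper's contract after this change (not its exact body):

    /* Sketch of the contract: a NULL left neighbor decodes as DC_PRED. */
    static MB_PREDICTION_MODE left_block_mode_sketch(const MODE_INFO *cur,
                                                     const MODE_INFO *left,
                                                     int b) {
      if (b & 1)                          /* right sub-blocks: neighbor is */
        return cur->bmi[b - 1].as_mode;   /* inside the current block */
      return left ? left->bmi[b + 1].as_mode : DC_PRED;
    }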
@@ -578,24 +544,25 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]);
}
-static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
+static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col) {
+ int mi_row, int mi_col, int index) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MODE_INFO *m = mi_8x8[0];
if (m->mbmi.sb_type < BLOCK_8X8)
- if (xd->ab_index > 0)
+ if (index > 0)
return;
- xd->this_mi = mi_8x8[0];
xd->mi_8x8 = mi_8x8;
- set_mi_row_col(&cpi->common, xd,
+ set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
- mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]);
- if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+ mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
+ cm->mi_rows, cm->mi_cols);
+ if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cpi, mi_8x8, bc);
#ifdef ENTROPY_STATS
active_section = 8;
@@ -611,11 +578,35 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
pack_mb_tokens(bc, tok, tok_end);
}
-static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
+static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
+ PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int ctx = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (has_rows && has_cols) {
+ write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ vp9_write(w, p == PARTITION_SPLIT, probs[1]);
+ } else if (has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ vp9_write(w, p == PARTITION_SPLIT, probs[2]);
+ } else {
+ assert(p == PARTITION_SPLIT);
+ }
+}
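
write_partition() centralizes the edge handling that write_modes_sb() used to do inline: for an interior block the full partition tree is coded, but when the block crosses the bottom or right frame edge only one non-split type remains legal, so a single bit (split vs. that type) is written with the matching interior-node probability, and when it crosses both edges PARTITION_SPLIT is implied and costs no bits. A hedged sketch of the mirror-image read side, using a hypothetical tree reader:

    /* Sketch, not the decoder's actual body; read_tree() is hypothetical,
     * vp9_read() stands for the boolean decoder's bit read. */
    static PARTITION_TYPE read_partition_sketch(vp9_reader *r,
                                                const vp9_prob *probs,
                                                int has_rows, int has_cols) {
      if (has_rows && has_cols)
        return read_tree(r, vp9_partition_tree, probs);  /* full tree */
      if (has_cols)  /* past the bottom edge: HORZ or SPLIT only */
        return vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
      if (has_rows)  /* past the right edge: VERT or SPLIT only */
        return vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
      return PARTITION_SPLIT;                            /* implied */
    }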
+
+static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int index) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
int bsl = b_width_log2(bsize);
int bs = (1 << bsl) / 4; // mode_info step for subsize
@@ -629,52 +620,37 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
partition = partition_lookup[bsl][m->mbmi.sb_type];
- if (bsize < BLOCK_8X8)
- if (xd->ab_index > 0)
+ if (bsize < BLOCK_8X8) {
+ if (index > 0)
return;
-
- if (bsize >= BLOCK_8X8) {
- int pl;
- const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- // encode the partition information
- if (idx == 0)
- write_token(bc, vp9_partition_tree,
- cm->fc.partition_prob[cm->frame_type][pl],
- vp9_partition_encodings + partition);
- else if (idx > 0)
- vp9_write(bc, partition == PARTITION_SPLIT,
- cm->fc.partition_prob[cm->frame_type][pl][idx]);
+ } else {
+ write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc);
}
subsize = get_subsize(bsize, partition);
- *(get_sb_index(xd, subsize)) = 0;
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
- *(get_sb_index(xd, subsize)) = 1;
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_row + bs) < cm->mi_rows)
- write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs,
- mi_col);
+ write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
+ mi_row + bs, mi_col, 1);
break;
case PARTITION_VERT:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
- *(get_sb_index(xd, subsize)) = 1;
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs);
+ write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
+ mi_row, mi_col + bs, 1);
break;
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
- int j = n >> 1, i = n & 0x01;
- *(get_sb_index(xd, subsize)) = n;
- write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end,
- mi_row + j * bs, mi_col + i * bs, subsize);
+ const int j = n >> 1, i = n & 1;
+ write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
+ tok, tok_end,
+ mi_row + j * bs, mi_col + i * bs, subsize, n);
}
break;
default:
@@ -683,13 +659,13 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
// update partition context
if (bsize >= BLOCK_8X8 &&
- (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, subsize, bsize);
- }
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, subsize, bsize);
}
-static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
+static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
+ vp9_writer* const bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
@@ -697,57 +673,27 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
MODE_INFO **mi_8x8 = cm->mi_grid_visible;
MODE_INFO **m_8x8;
- mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis;
+ mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
- for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += 8, mi_8x8 += 8 * mis) {
m_8x8 = mi_8x8;
- vp9_zero(cm->left_seg_context);
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ vp9_zero(cpi->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
- write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col,
- BLOCK_64X64);
+ write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
+ BLOCK_64X64, 0);
}
}
}
-/* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
- /* print coef probability tree */
- int i, j, k, l, m;
- FILE *f = fopen("enc_tree_probs.txt", "a");
- fprintf(f, "{\n");
- for (i = 0; i < block_types; i++) {
- fprintf(f, " {\n");
- for (j = 0; j < REF_TYPES; ++j) {
- fprintf(f, " {\n");
- for (k = 0; k < COEF_BANDS; k++) {
- fprintf(f, " {\n");
- for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
- fprintf(f, " {");
- for (m = 0; m < ENTROPY_NODES; m++) {
- fprintf(f, "%3u, ",
- (unsigned int)(coef_probs[i][j][k][l][m]));
- }
- }
- fprintf(f, " }\n");
- }
- fprintf(f, " }\n");
- }
- fprintf(f, " }\n");
- }
- fprintf(f, "}\n");
- fclose(f);
-}
-
static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size];
vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
- vp9_prob full_probs[ENTROPY_NODES];
- int i, j, k, l;
+ int i, j, k, l, m;
for (i = 0; i < BLOCK_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
@@ -756,16 +702,14 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
if (l >= 3 && k == 0)
continue;
vp9_tree_probs_from_distribution(vp9_coef_tree,
- full_probs,
coef_branch_ct[i][j][k][l],
coef_counts[i][j][k][l], 0);
- vpx_memcpy(coef_probs[i][j][k][l], full_probs,
- sizeof(vp9_prob) * UNCONSTRAINED_NODES);
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
- coef_probs[i][j][k][l][0] =
- get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
- coef_branch_ct[i][j][k][l][0][1]);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] = get_binary_prob(
+ coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing) {
int t;
@@ -794,7 +738,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
vp9_coeff_probs_model *old_frame_coef_probs =
cpi->common.fc.coef_probs[tx_size];
vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ const vp9_prob upd = DIFF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
int i, j, k, l, t;
switch (cpi->sf.use_fast_coef_updates) {
@@ -849,7 +793,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
for (t = 0; t < entropy_nodes_update; ++t) {
vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ const vp9_prob upd = DIFF_UPDATE_PROB;
int s;
int u = 0;
if (l >= 3 && k == 0)
@@ -1132,26 +1076,23 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i],
- ct_8x8p);
+ tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
- MODE_UPDATE_PROB, ct_8x8p[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i],
- ct_16x16p);
+ tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
- MODE_UPDATE_PROB, ct_16x16p[j]);
+ ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
- MODE_UPDATE_PROB, ct_32x32p[j]);
+ ct_32x32p[j]);
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
@@ -1160,9 +1101,9 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
}
}
-static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
+static void write_interp_filter_type(INTERPOLATION_TYPE type,
struct vp9_write_bit_buffer *wb) {
- const int type_to_literal[] = { 1, 0, 2 };
+ const int type_to_literal[] = { 1, 0, 2, 3 };
vp9_wb_write_bit(wb, type == SWITCHABLE);
if (type != SWITCHABLE)
@@ -1178,7 +1119,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
int i, j, c = 0;
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
count[i] += cm->counts.switchable_interp[j][i];
c += (count[i] > 0);
}
@@ -1258,7 +1199,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
- vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+ vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) *
mi_cols_aligned_to_sb(cm->mi_cols));
tok[0][0] = cpi->tok;
@@ -1273,9 +1214,10 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- vp9_get_tile_col_offsets(cm, tile_col);
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -1283,7 +1225,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
else
vp9_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
+ write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end);
assert(tok[tile_row][tile_col] == tok_end);
vp9_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
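
encode_tiles() now builds a stack-local TileInfo via vp9_tile_init() instead of mutating the per-tile offsets that used to live in VP9_COMMON (the removed vp9_get_tile_row_offsets/vp9_get_tile_col_offsets calls), so the bounds travel down the call chain to write_modes(). The loop shape after the change, with the per-tile size bookkeeping elided:

    for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
        TileInfo tile;
        vp9_tile_init(&tile, cm, 0, tile_col);   /* bounds on the stack */
        /* ... pick the output position, then: */
        write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col],
                    tok_end);
        /* ... emit the 4-byte tile size for all but the last tile ... */
      }
    }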
@@ -1352,18 +1294,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
}
static void write_sync_code(struct vp9_write_bit_buffer *wb) {
- vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
- vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
- vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
}
static void write_uncompressed_header(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- // frame marker bits
- vp9_wb_write_literal(wb, 0x2, 2);
+ vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
// bitstream version.
// 00 - profile 0. 4:2:0 only
@@ -1377,18 +1317,10 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->error_resilient_mode);
if (cm->frame_type == KEY_FRAME) {
+ const COLOR_SPACE cs = UNKNOWN;
write_sync_code(wb);
- // colorspaces
- // 000 - Unknown
- // 001 - BT.601
- // 010 - BT.709
- // 011 - SMPTE-170
- // 100 - SMPTE-240
- // 101 - Reserved
- // 110 - Reserved
- // 111 - sRGB (RGB)
- vp9_wb_write_literal(wb, 0, 3);
- if (1 /* colorspace != sRGB */) {
+ vp9_wb_write_literal(wb, cs, 3);
+ if (cs != SRGB) {
vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
if (cm->version == 1) {
vp9_wb_write_bit(wb, cm->subsampling_x);
@@ -1425,7 +1357,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
write_frame_size_with_refs(cpi, wb);
- vp9_wb_write_bit(wb, xd->allow_high_precision_mv);
+ vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
fix_mcomp_filter_type(cpi);
write_interp_filter_type(cm->mcomp_filter_type, wb);
@@ -1467,7 +1399,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
vp9_update_skip_probs(cpi, &header_bc);
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
int i;
#ifdef ENTROPY_STATS
active_section = 1;
@@ -1481,7 +1413,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
- MODE_UPDATE_PROB,
cpi->intra_inter_count[i]);
if (cm->allow_comp_inter_inter) {
@@ -1495,7 +1426,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
- MODE_UPDATE_PROB,
cpi->comp_inter_count[i]);
}
}
@@ -1503,10 +1433,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
- MODE_UPDATE_PROB,
cpi->single_ref_count[i][0]);
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
- MODE_UPDATE_PROB,
cpi->single_ref_count[i][1]);
}
}
@@ -1514,21 +1442,18 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
- MODE_UPDATE_PROB,
cpi->comp_ref_count[i]);
update_mbintra_mode_probs(cpi, &header_bc);
- for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) {
- vp9_prob pnew[PARTITION_TYPES - 1];
+ for (i = 0; i < PARTITION_CONTEXTS; ++i) {
unsigned int bct[PARTITION_TYPES - 1][2];
- update_mode(&header_bc, PARTITION_TYPES,
- vp9_partition_tree, pnew,
- fc->partition_prob[cm->frame_type][i], bct,
+ update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree,
+ fc->partition_prob[i], bct,
(unsigned int *)cpi->partition_count[i]);
}
- vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
+ vp9_write_nmv_probs(cpi, cm->allow_high_precision_mv, &header_bc);
}
vp9_stop_encode(&header_bc);
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index 013047e..8033a4d 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -23,17 +23,11 @@ typedef struct {
int offset;
} search_site;
-typedef struct {
- struct {
- MB_PREDICTION_MODE mode;
- } bmi[4];
-} PARTITION_INFO;
-
// Structure to hold a snapshot of the coding context during the mode picking process
-// TODO Do we need all of these?
typedef struct {
MODE_INFO mic;
- PARTITION_INFO partition_info;
+ uint8_t *zcoeff_blk;
+ int num_4x4_blk;
int skip;
int_mv best_ref_mv;
int_mv second_best_ref_mv;
@@ -48,7 +42,7 @@ typedef struct {
int comp_pred_diff;
int single_pred_diff;
int64_t tx_rd_diff[TX_MODES];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
// motion vector cache for adaptive motion search control in partition
// search loop
@@ -62,8 +56,8 @@ typedef struct {
} PICK_MODE_CONTEXT;
struct macroblock_plane {
- DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);
- DECLARE_ALIGNED(16, int16_t, coeff[64*64]);
+ DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+ DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
struct buf_2d src;
// Quantizer settings
@@ -87,9 +81,6 @@ struct macroblock {
MACROBLOCKD e_mbd;
int skip_block;
- PARTITION_INFO *partition_info; /* work pointer */
- PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
- PARTITION_INFO *pip; /* Base of allocated array */
search_site *ss;
int ss_count;
@@ -100,6 +91,7 @@ struct macroblock {
int sadperbit4;
int rddiv;
int rdmult;
+ unsigned int mb_energy;
unsigned int *mb_activity_ptr;
int *mb_norm_activity_ptr;
signed int act_zbin_adj;
@@ -123,11 +115,10 @@ struct macroblock {
int **mvsadcost;
int mbmode_cost[MB_MODE_COUNT];
- unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV];
+ unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
- int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
@@ -136,6 +127,7 @@ struct macroblock {
int mv_row_min;
int mv_row_max;
+ uint8_t zcoeff_blk[TX_SIZES][256];
int skip;
int encode_breakout;
@@ -144,6 +136,7 @@ struct macroblock {
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
+ DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
int optimize;
@@ -172,19 +165,72 @@ struct macroblock {
PICK_MODE_CONTEXT sb32x64_context[2];
PICK_MODE_CONTEXT sb64x32_context[2];
PICK_MODE_CONTEXT sb64_context;
- int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
BLOCK_SIZE b_partitioning[4][4][4];
BLOCK_SIZE mb_partitioning[4][4];
BLOCK_SIZE sb_partitioning[4];
BLOCK_SIZE sb64_partitioning;
- void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
- void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
- void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
- void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
- void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
- int y_blocks);
+ void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+};
+
+// TODO(jingning): the variables used here are a little complicated. Further
+// refactoring is needed to organize the temporary buffers once recursive
+// partitioning down to 4x4 block size is enabled.
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ switch (bsize) {
+ case BLOCK_64X64:
+ return &x->sb64_context;
+ case BLOCK_64X32:
+ return &x->sb64x32_context[xd->sb_index];
+ case BLOCK_32X64:
+ return &x->sb32x64_context[xd->sb_index];
+ case BLOCK_32X32:
+ return &x->sb32_context[xd->sb_index];
+ case BLOCK_32X16:
+ return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+ case BLOCK_16X32:
+ return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+ case BLOCK_16X16:
+ return &x->mb_context[xd->sb_index][xd->mb_index];
+ case BLOCK_16X8:
+ return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_8X16:
+ return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_8X8:
+ return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_8X4:
+ return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_4X8:
+ return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_4X4:
+ return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ default:
+ assert(0);
+ return NULL;
+ }
+}
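
get_block_context() maps a block size to the PICK_MODE_CONTEXT slot cached for it, indexed by the sb/mb/b/ab cursors that the recursive partition search keeps in MACROBLOCKD. A hedged usage sketch; the field copy is illustrative, not the encoder's exact flow:

    /* During partition search: fetch the cache slot for this size and
     * snapshot the chosen mode so a parent node can restore it later. */
    PICK_MODE_CONTEXT *const ctx = get_block_context(x, BLOCK_16X16);
    ctx->mic = *xd->mi_8x8[0];   /* illustrative snapshot */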
+
+struct rdcost_block_args {
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[16];
+ ENTROPY_CONTEXT t_left[16];
+ TX_SIZE tx_size;
+ int bw;
+ int bh;
+ int rate;
+ int64_t dist;
+ int64_t sse;
+ int this_rate;
+ int64_t this_dist;
+ int64_t this_sse;
+ int64_t this_rd;
+ int64_t best_rd;
+ int skip;
+ const int16_t *scan, *nb;
};
#endif // VP9_ENCODER_VP9_BLOCK_H_
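
The removal of PARTITION_INFO above is the header-side half of the bitstream change earlier in this patch: sub-8x8 modes now have a single source of truth in MODE_INFO, so the parallel partition_info/pi/pip pointers in struct macroblock go away as well.

    /* Before/after shape of the sub-8x8 mode lookup (see pack_inter_mode_mvs):
     *   old:  blockmode = x->partition_info->bmi[j].mode;
     *   new:  blockmode = m->bmi[j].as_mode;   // read MODE_INFO directly
     */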
diff --git a/libvpx/vp9/encoder/vp9_boolhuff.c b/libvpx/vp9/encoder/vp9_boolhuff.c
index 0f1aa59..32c136e 100644
--- a/libvpx/vp9/encoder/vp9_boolhuff.c
+++ b/libvpx/vp9/encoder/vp9_boolhuff.c
@@ -22,23 +22,28 @@ unsigned int active_section = 0;
#endif
const unsigned int vp9_prob_cost[256] = {
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
+ 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889,
+ 873, 858, 843, 829, 816, 803, 790, 778, 767, 755, 744, 733,
+ 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541,
+ 534, 528, 522, 516, 511, 505, 499, 494, 488, 483, 477, 472,
+ 467, 462, 457, 452, 447, 442, 437, 433, 428, 424, 419, 415,
+ 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321,
+ 317, 314, 311, 307, 304, 301, 297, 294, 291, 288, 285, 281,
+ 278, 275, 272, 269, 266, 263, 260, 257, 255, 252, 249, 246,
+ 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184,
+ 181, 179, 177, 174, 172, 170, 168, 165, 163, 161, 159, 156,
+ 154, 152, 150, 148, 145, 143, 141, 139, 137, 135, 133, 131,
+ 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84,
+ 82, 81, 79, 77, 75, 73, 72, 70, 68, 66, 65, 63,
+ 61, 60, 58, 56, 55, 53, 51, 50, 48, 46, 45, 43,
+ 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6,
+ 4, 3, 1, 1};
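
The vp9_prob_cost reflow above (12 entries per row) changes no values. Each entry is the cost, in 1/256-bit units, of coding a zero bit whose probability is p/256, i.e. approximately -256 * log2(p/256), with entry 0 clamped to match entry 1. A self-contained check that regenerates the table to within rounding; the checked-in values can differ by about 1 from this float formula, so treat it as a sanity check, not the canonical generator:

    #include <math.h>
    #include <stdio.h>

    /* Build with -lm. Prints ~vp9_prob_cost[1..255]; expect +/-1 vs. the
     * table above because the exact rounding rule here is an assumption. */
    int main(void) {
      int p;
      for (p = 1; p < 256; ++p)
        printf("%4d%s", (int)(-256.0 * log2(p / 256.0)),
               (p % 12) ? ", " : ",\n");
      putchar('\n');
      return 0;
    }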
void vp9_start_encode(vp9_writer *br, uint8_t *source) {
br->lowvalue = 0;
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c
index 4f4ad04..065992a 100644
--- a/libvpx/vp9/encoder/vp9_dct.c
+++ b/libvpx/vp9/encoder/vp9_dct.c
@@ -8,16 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include <assert.h>
#include <math.h>
+
#include "./vpx_config.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_systemdependent.h"
-static void fdct4_1d(int16_t *input, int16_t *output) {
+#include "vp9/encoder/vp9_dct.h"
+
+static void fdct4(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
@@ -36,18 +39,17 @@ static void fdct4_1d(int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(temp2);
}
-void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[4 * 4];
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
@@ -58,10 +60,10 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
- input[0] = in[0 * stride] << 4;
- input[1] = in[1 * stride] << 4;
- input[2] = in[2 * stride] << 4;
- input[3] = in[3 * stride] << 4;
+ input[0] = in[0 * stride] * 16;
+ input[1] = in[1 * stride] * 16;
+ input[2] = in[2 * stride] * 16;
+ input[3] = in[3 * stride] * 16;
if (i == 0 && input[0]) {
input[0] += 1;
}
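
A pattern worth calling out once, since it repeats through all the transforms in this file: the scaling of residuals changes from left shifts (<< 4, << 2) to multiplications (* 16, * 4). The inputs are pixel differences and can be negative, and left-shifting a negative signed integer is undefined behavior in C (C99 6.5.7); the multiply expresses the same scaling legally, and compilers emit the same instruction for it.

    int16_t d = -7;        /* residuals can be negative */
    int scaled = d * 16;   /* defined; `d << 4` would be UB for negative d */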
@@ -102,7 +104,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
}
}
-static void fadst4_1d(int16_t *input, int16_t *output) {
+static void fadst4(const int16_t *input, int16_t *output) {
int x0, x1, x2, x3;
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -143,14 +145,14 @@ static void fadst4_1d(int16_t *input, int16_t *output) {
}
static const transform_2d FHT_4[] = {
- { fdct4_1d, fdct4_1d }, // DCT_DCT = 0
- { fadst4_1d, fdct4_1d }, // ADST_DCT = 1
- { fdct4_1d, fadst4_1d }, // DCT_ADST = 2
- { fadst4_1d, fadst4_1d } // ADST_ADST = 3
+ { fdct4, fdct4 }, // DCT_DCT = 0
+ { fadst4, fdct4 }, // ADST_DCT = 1
+ { fdct4, fadst4 }, // DCT_ADST = 2
+ { fadst4, fadst4 } // ADST_ADST = 3
};
-void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[4 * 4];
int16_t *outptr = &out[0];
int i, j;
@@ -160,7 +162,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
- temp_in[j] = input[j * pitch + i] << 4;
+ temp_in[j] = input[j * stride + i] * 16;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
@@ -178,12 +180,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
}
}
-void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {
- vp9_short_fdct4x4_c(input, output, pitch);
- vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-static void fdct8_1d(int16_t *input, int16_t *output) {
+static void fdct8(const int16_t *input, int16_t *output) {
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
/*canbe16*/ int x0, x1, x2, x3;
@@ -198,7 +195,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) {
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -235,8 +232,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) {
output[7] = dct_const_round_shift(t3);
}
-void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
- const int stride = pitch >> 1;
+void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
int i, j;
int16_t intermediate[64];
@@ -250,16 +246,16 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
int i;
for (i = 0; i < 8; i++) {
// stage 1
- s0 = (input[0 * stride] + input[7 * stride]) << 2;
- s1 = (input[1 * stride] + input[6 * stride]) << 2;
- s2 = (input[2 * stride] + input[5 * stride]) << 2;
- s3 = (input[3 * stride] + input[4 * stride]) << 2;
- s4 = (input[3 * stride] - input[4 * stride]) << 2;
- s5 = (input[2 * stride] - input[5 * stride]) << 2;
- s6 = (input[1 * stride] - input[6 * stride]) << 2;
- s7 = (input[0 * stride] - input[7 * stride]) << 2;
-
- // fdct4_1d(step, step);
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -301,24 +297,23 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
// Rows
for (i = 0; i < 8; ++i) {
- fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);
+ fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
}
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[256];
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
@@ -331,23 +326,23 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
- input[0] = (in[0 * stride] + in[15 * stride]) << 2;
- input[1] = (in[1 * stride] + in[14 * stride]) << 2;
- input[2] = (in[2 * stride] + in[13 * stride]) << 2;
- input[3] = (in[3 * stride] + in[12 * stride]) << 2;
- input[4] = (in[4 * stride] + in[11 * stride]) << 2;
- input[5] = (in[5 * stride] + in[10 * stride]) << 2;
- input[6] = (in[6 * stride] + in[ 9 * stride]) << 2;
- input[7] = (in[7 * stride] + in[ 8 * stride]) << 2;
+ input[0] = (in[0 * stride] + in[15 * stride]) * 4;
+ input[1] = (in[1 * stride] + in[14 * stride]) * 4;
+ input[2] = (in[2 * stride] + in[13 * stride]) * 4;
+ input[3] = (in[3 * stride] + in[12 * stride]) * 4;
+ input[4] = (in[4 * stride] + in[11 * stride]) * 4;
+ input[5] = (in[5 * stride] + in[10 * stride]) * 4;
+ input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
+ input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
// Calculate input for the next 8 results.
- step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2;
- step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2;
- step1[2] = (in[5 * stride] - in[10 * stride]) << 2;
- step1[3] = (in[4 * stride] - in[11 * stride]) << 2;
- step1[4] = (in[3 * stride] - in[12 * stride]) << 2;
- step1[5] = (in[2 * stride] - in[13 * stride]) << 2;
- step1[6] = (in[1 * stride] - in[14 * stride]) << 2;
- step1[7] = (in[0 * stride] - in[15 * stride]) << 2;
+ step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
+ step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
+ step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
+ step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
+ step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
+ step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
+ step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
+ step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
@@ -368,7 +363,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
}
- // Work on the first eight values; fdct8_1d(input, even_results);
+ // Work on the first eight values; fdct8(input, even_results);
{
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
@@ -384,7 +379,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -486,7 +481,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
}
}
-static void fadst8_1d(int16_t *input, int16_t *output) {
+static void fadst8(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -558,14 +553,14 @@ static void fadst8_1d(int16_t *input, int16_t *output) {
}
static const transform_2d FHT_8[] = {
- { fdct8_1d, fdct8_1d }, // DCT_DCT = 0
- { fadst8_1d, fdct8_1d }, // ADST_DCT = 1
- { fdct8_1d, fadst8_1d }, // DCT_ADST = 2
- { fadst8_1d, fadst8_1d } // ADST_ADST = 3
+ { fdct8, fdct8 }, // DCT_DCT = 0
+ { fadst8, fdct8 }, // ADST_DCT = 1
+ { fdct8, fadst8 }, // DCT_ADST = 2
+ { fadst8, fadst8 } // ADST_ADST = 3
};
-void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[64];
int16_t *outptr = &out[0];
int i, j;
@@ -575,7 +570,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
- temp_in[j] = input[j * pitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
outptr[j * 8 + i] = temp_out[j];
@@ -593,18 +588,17 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
int i;
int a1, b1, c1, d1, e1;
- short *ip = input;
- short *op = output;
- int pitch_short = pitch >> 1;
+ const int16_t *ip = input;
+ int16_t *op = output;
for (i = 0; i < 4; i++) {
- a1 = ip[0 * pitch_short];
- b1 = ip[1 * pitch_short];
- c1 = ip[2 * pitch_short];
- d1 = ip[3 * pitch_short];
+ a1 = ip[0 * stride];
+ b1 = ip[1 * stride];
+ c1 = ip[2 * stride];
+ d1 = ip[3 * stride];
a1 += b1;
d1 = d1 - c1;
@@ -637,24 +631,18 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
c1 = e1 - c1;
a1 -= c1;
d1 += b1;
- op[0] = a1 << WHT_UPSCALE_FACTOR;
- op[1] = c1 << WHT_UPSCALE_FACTOR;
- op[2] = d1 << WHT_UPSCALE_FACTOR;
- op[3] = b1 << WHT_UPSCALE_FACTOR;
+ op[0] = a1 * UNIT_QUANT_FACTOR;
+ op[1] = c1 * UNIT_QUANT_FACTOR;
+ op[2] = d1 * UNIT_QUANT_FACTOR;
+ op[3] = b1 * UNIT_QUANT_FACTOR;
ip += 4;
op += 4;
}
}
-void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
- vp9_short_walsh4x4_c(input, output, pitch);
- vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
-}
-
-
// Rewritten to use the same algorithm as the others.
-static void fdct16_1d(int16_t in[16], int16_t out[16]) {
+static void fdct16(const int16_t in[16], int16_t out[16]) {
/*canbe16*/ int step1[8];
/*canbe16*/ int step2[8];
/*canbe16*/ int step3[8];
@@ -680,7 +668,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) {
step1[6] = in[1] - in[14];
step1[7] = in[0] - in[15];
- // fdct8_1d(step, step);
+ // fdct8(step, step);
{
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
@@ -696,7 +684,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) {
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -795,7 +783,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) {
out[15] = dct_const_round_shift(temp2);
}
-void fadst16_1d(int16_t *input, int16_t *output) {
+static void fadst16(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -958,14 +946,14 @@ void fadst16_1d(int16_t *input, int16_t *output) {
}
static const transform_2d FHT_16[] = {
- { fdct16_1d, fdct16_1d }, // DCT_DCT = 0
- { fadst16_1d, fdct16_1d }, // ADST_DCT = 1
- { fdct16_1d, fadst16_1d }, // DCT_ADST = 2
- { fadst16_1d, fadst16_1d } // ADST_ADST = 3
+ { fdct16, fdct16 }, // DCT_DCT = 0
+ { fadst16, fdct16 }, // ADST_DCT = 1
+ { fdct16, fadst16 }, // DCT_ADST = 2
+ { fadst16, fadst16 } // ADST_ADST = 3
};
-void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[256];
int16_t *outptr = &out[0];
int i, j;
@@ -975,7 +963,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
- temp_in[j] = input[j * pitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
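
The store above folds a divide-by-4 with rounding into the shift: assuming arithmetic right shift of negatives (which libvpx relies on throughout), (x + 1 + (x < 0)) >> 2 rounds x/4 half toward zero, while the 32x32 path below uses (x > 0) and rounds half away from zero. Worked values:

    /* (x + 1 + (x < 0)) >> 2  ==  round-half-toward-zero of x/4:
     *   x =  6: ( 6+1+0) >> 2 =  1    ( 1.5  ->  1)
     *   x = -6: (-6+1+1) >> 2 = -1    (-1.5  -> -1)
     *   x =  7: ( 7+1+0) >> 2 =  2    ( 1.75 ->  2)
     *   x = -7: (-7+1+1) >> 2 = -2    (-1.75 -> -2)
     */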
@@ -1003,7 +991,7 @@ static INLINE int half_round_shift(int input) {
return rv;
}
-static void dct32_1d(int *input, int *output, int round) {
+static void dct32_1d(const int *input, int *output, int round) {
int step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1326,8 +1314,7 @@ static void dct32_1d(int *input, int *output, int round) {
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
+void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1335,7 +1322,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * shortpitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -1355,8 +1342,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
// Note that although we use dct_32_round in the dct32_1d computation flow,
// this 2D fdct32x32, used in the rate-distortion optimization loop, operates
// within 16-bit precision.
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1364,7 +1350,7 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * shortpitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
@@ -1383,3 +1369,27 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
out[j + i * 32] = temp_out[j];
}
}
+
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct4x4(input, output, stride);
+ else
+ vp9_short_fht4x4(input, output, stride, tx_type);
+}
+
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct8x8(input, output, stride);
+ else
+ vp9_short_fht8x8(input, output, stride, tx_type);
+}
+
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct16x16(input, output, stride);
+ else
+ vp9_short_fht16x16(input, output, stride, tx_type);
+}
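
The new vp9_fht4x4/vp9_fht8x8/vp9_fht16x16 wrappers let callers select a transform by TX_TYPE alone: DCT_DCT takes the plain fDCT path and every other type the hybrid path. A hedged usage sketch; the buffers stand in for the encoder's real ones:

    int16_t diff[8 * 8];    /* residual block, row stride 8 */
    int16_t coeff[8 * 8];   /* transform output */
    /* ... fill diff ... */
    vp9_fht8x8(ADST_DCT, diff, coeff, 8);  /* non-DCT type -> hybrid path */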
diff --git a/libvpx/vp9/encoder/vp9_dct.h b/libvpx/vp9/encoder/vp9_dct.h
new file mode 100644
index 0000000..aaf976d
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_dct.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_DCT_H_
+#define VP9_ENCODER_VP9_DCT_H_
+
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+#endif // VP9_ENCODER_VP9_DCT_H_
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 44ab02d..a45299b 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -22,6 +22,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
@@ -29,7 +30,6 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_tile_common.h"
-
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encodeintra.h"
#include "vp9/encoder/vp9_encodemb.h"
@@ -37,24 +37,44 @@
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_vaq.h"
-#define DBG_PRNT_SEGMAP 0
+#define DBG_PRNT_SEGMAP 0
-static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
- TX_4X4, // ONLY_4X4
- TX_8X8, // ONLY_8X8
- TX_16X16, // ONLY_16X16
- TX_32X32, // ONLY_32X32
- TX_32X32, // TX_MODE_SELECT
-};
// #define ENC_DEBUG
#ifdef ENC_DEBUG
int enc_debug = 0;
#endif
+static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+ switch (subsize) {
+ case BLOCK_64X64:
+ case BLOCK_64X32:
+ case BLOCK_32X64:
+ case BLOCK_32X32:
+ return &xd->sb_index;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ return &xd->mb_index;
+ case BLOCK_16X8:
+ case BLOCK_8X16:
+ case BLOCK_8X8:
+ return &xd->b_index;
+ case BLOCK_8X4:
+ case BLOCK_4X8:
+ case BLOCK_4X4:
+ return &xd->ab_index;
+ default:
+ assert(0);
+ return NULL;
+ }
+}
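
get_sb_index() moves into the encoder, since only the partition search needs it, and returns a pointer so the recursion can set the child index before descending, exactly as the old write_modes_sb() did before this patch:

    /* Sketch of the caller pattern: tag each quadrant before recursing. */
    for (n = 0; n < 4; ++n) {
      *get_sb_index(xd, subsize) = n;
      /* ... recurse into quadrant n at block size `subsize` ... */
    }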
+
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize);
@@ -122,7 +142,6 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) {
static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) {
return vp9_encode_intra(x, use_dc_pred);
}
-DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0};
// Measure the activity of the current macroblock.
// What we measure here is TBD, so it is abstracted into this function.
@@ -173,8 +192,9 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
tmp = sortlist[j - 1];
sortlist[j - 1] = sortlist[j];
sortlist[j] = tmp;
- } else
- break;
+ } else {
+ break;
+ }
}
}
@@ -246,13 +266,11 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
#if OUTPUT_NORM_ACT_STATS
fprintf(f, "\n");
#endif
-
}
#if OUTPUT_NORM_ACT_STATS
fclose(f);
#endif
-
}
#endif // USE_ACT_INDEX
@@ -264,7 +282,7 @@ static void build_activity_map(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
#if ALT_ACT_MEASURE
- YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm);
int recon_yoffset;
int recon_y_stride = new_yv12->y_stride;
#endif
@@ -317,7 +335,6 @@ static void build_activity_map(VP9_COMP *cpi) {
// Calculate an activity index number of each mb
calc_activity_index(cpi, x);
#endif
-
}
// Macroblock activity masking
@@ -351,8 +368,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO * const mbmi = &xd->this_mi->mbmi;
- MODE_INFO *mi_addr = xd->this_mi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi_8x8[0];
int mb_mode_index = ctx->best_mode_index;
const int mis = cm->mode_info_stride;
@@ -360,7 +377,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
const int mi_height = num_8x8_blocks_high_lookup[bsize];
assert(mi->mbmi.mode < MB_MODE_COUNT);
- assert(mb_mode_index < MAX_MODES);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
assert(mi->mbmi.sb_type == bsize);
@@ -375,6 +391,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
&& (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
xd->mi_8x8[x_idx + y * mis] = mi_addr;
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_mb_init_quantizer(cpi, x);
+ }
+
// FIXME(rbultje) I'm pretty sure this should go to the end of this block
// (i.e. after the output_enabled)
if (bsize < BLOCK_32X32) {
@@ -384,12 +404,14 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
}
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
- *x->partition_info = ctx->partition_info;
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
}
x->skip = ctx->skip;
+ vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
if (!output_enabled)
return;
@@ -398,15 +420,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
}
- if (cm->frame_type == KEY_FRAME) {
- // Restore the coding modes to that held in the coding context
- // if (mb_mode == I4X4_PRED)
- // for (i = 0; i < 16; i++)
- // {
- // xd->block[i].bmi.as_mode =
- // xd->mode_info_context->bmi[i].as_mode;
- // assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
- // }
+ if (frame_is_intra_only(cm)) {
#if CONFIG_INTERNAL_STATS
static const int kf_mode_index[] = {
THR_DC /*DC_PRED*/,
@@ -419,7 +433,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
THR_D207_PRED /*D207_PRED*/,
THR_D63_PRED /*D63_PRED*/,
THR_TM /*TM_PRED*/,
- THR_B_PRED /*I4X4_PRED*/,
};
cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++;
#endif
@@ -428,18 +441,19 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->mode_chosen_counts[mb_mode_index]++;
if (is_inter_block(mbmi)
&& (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
- int_mv best_mv, best_second_mv;
+ int_mv best_mv[2];
const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
- best_mv.as_int = ctx->best_ref_mv.as_int;
- best_second_mv.as_int = ctx->second_best_ref_mv.as_int;
+ best_mv[0].as_int = ctx->best_ref_mv.as_int;
+ best_mv[1].as_int = ctx->second_best_ref_mv.as_int;
if (mbmi->mode == NEWMV) {
- best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int;
- best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int;
+ best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int;
+ if (rf2 > 0)
+ best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int;
}
- mbmi->best_mv.as_int = best_mv.as_int;
- mbmi->best_second_mv.as_int = best_second_mv.as_int;
- vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
+ mbmi->best_mv[0].as_int = best_mv[0].as_int;
+ mbmi->best_mv[1].as_int = best_mv[1].as_int;
+ vp9_update_mv_count(cpi, x, best_mv);
}
if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) {
@@ -451,28 +465,27 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
}
}
void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
- int mb_row, int mb_col) {
- uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src
- ->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src
- ->alpha_stride};
+ int mi_row, int mi_col) {
+ uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+ src->alpha_buffer};
+ const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+ src->alpha_stride};
int i;
- for (i = 0; i < MAX_MB_PLANE; i++) {
- setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col,
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
NULL, x->e_mbd.plane[i].subsampling_x,
x->e_mbd.plane[i].subsampling_y);
- }
}
-static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -486,16 +499,12 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int idx_map = mb_row * cm->mb_cols + mb_col;
const struct segmentation *const seg = &cm->seg;
- set_skip_context(cm, xd, mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
+ set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col);
// Activity map pointer
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
x->active_ptr = cpi->active_map + idx_map;
- /* pointers to mode info contexts */
- x->partition_info = x->pi + idx_str;
-
xd->mi_8x8 = cm->mi_grid_visible + idx_str;
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
@@ -503,10 +512,9 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
// cannot be used.
xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
- xd->this_mi =
xd->mi_8x8[0] = cm->mi + idx_str;
- mbmi = &xd->this_mi->mbmi;
+ mbmi = &xd->mi_8x8[0]->mbmi;
// Set up destination pointers
setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
@@ -520,7 +528,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
// Set up distance of MB to edge of frame in 1/8th pel units
assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
- set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width);
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_rows, cm->mi_cols);
/* set up source buffers */
vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
@@ -531,10 +540,11 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
/* segment ID */
if (seg->enabled) {
- uint8_t *map = seg->update_map ? cpi->segmentation_map
- : cm->last_frame_seg_map;
- mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
-
+ if (!cpi->sf.variance_adaptive_quantization) {
+ uint8_t *map = seg->update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
vp9_mb_init_quantizer(cpi, x);
if (seg->enabled && cpi->seg0_cnt > 0
@@ -546,9 +556,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int x = mb_col & ~3;
const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
- const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
- const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start)
- >> 1;
+ const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1;
+ const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1;
cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
<< 16) / cm->MBs;
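seg0_progress is a Q16 fixed-point fraction (1 << 16 represents 100%), now derived from the tile's own column bounds rather than fields on VP9_COMMON. A toy version of the fixed-point step; the helper name is made up, and a nonzero denominator is assumed since cm->MBs is positive for any coded frame:

    /* Q16 fixed point: 1 << 16 represents 100% of the frame's MBs. */
    static int q16_progress(int mbs_done, int total_mbs) {
      return (mbs_done << 16) / total_mbs;  /* total_mbs > 0 for a coded frame */
    }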
@@ -561,13 +570,19 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
}
}
-static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
+static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col,
int *totalrate, int64_t *totaldist,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ int orig_rdmult = x->rdmult;
+ double rdmult_ratio;
+
+ vp9_clear_system_state(); // __asm emms;
+ rdmult_ratio = 1.0; // avoid uninitialized warnings
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
@@ -582,35 +597,66 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
}
}
- set_offsets(cpi, mi_row, mi_col, bsize);
- xd->this_mi->mbmi.sb_type = bsize;
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ xd->mi_8x8[0]->mbmi.sb_type = bsize;
// Set to zero to make sure we do not use the previous encoded frame stats
- xd->this_mi->mbmi.skip_coeff = 0;
+ xd->mi_8x8[0]->mbmi.skip_coeff = 0;
x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
+ if (cpi->sf.variance_adaptive_quantization) {
+ int energy;
+ if (bsize <= BLOCK_16X16) {
+ energy = x->mb_energy;
+ } else {
+ energy = vp9_block_energy(cpi, x, bsize);
+ }
+
+ xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
+ rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
+ vp9_mb_init_quantizer(cpi, x);
+ }
+
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
vp9_activity_masking(cpi, x);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ x->rdmult = round(x->rdmult * rdmult_ratio);
+ }
+
// Find best coding mode & reconstruct the MB so it is available
// as a predictor for MBs that follow in the SB
- if (cm->frame_type == KEY_FRAME)
+ if (frame_is_intra_only(cm)) {
vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
best_rd);
- else
- vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
- bsize, ctx, best_rd);
+ } else {
+ if (bsize >= BLOCK_8X8)
+ vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+ totalrate, totaldist, bsize, ctx, best_rd);
+ else
+ vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
+ totaldist, bsize, ctx, best_rd);
+ }
+
+ if (cpi->sf.variance_adaptive_quantization) {
+ x->rdmult = orig_rdmult;
+ if (*totalrate != INT_MAX) {
+ vp9_clear_system_state(); // __asm emms;
+ *totalrate = round(*totalrate * rdmult_ratio);
+ }
+ }
}
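The variance-adaptive path scales the RD multiplier by a per-block ratio before the mode search and rescales the returned rate afterwards so differently scaled blocks stay comparable; vp9_clear_system_state() clears x87/MMX state before the floating-point math. A sketch of that scale/search/unscale pairing, with a hypothetical search callback:

    #include <limits.h>
    #include <math.h>

    /* Scale rdmult for the search, restore it afterwards, and rescale the
     * returned rate so blocks coded with different ratios remain
     * comparable. The search callback is hypothetical. */
    static void vaq_rd_search(int *rdmult, int *rate, double ratio,
                              void (*search)(int rdmult, int *rate)) {
      const int orig_rdmult = *rdmult;
      *rdmult = (int)round(*rdmult * ratio);
      search(*rdmult, rate);
      *rdmult = orig_rdmult;
      if (*rate != INT_MAX)  /* INT_MAX marks "no valid mode found" */
        *rate = (int)round(*rate * ratio);
    }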
static void update_stats(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *mi = xd->this_mi;
+ MODE_INFO *mi = xd->mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
@@ -637,49 +683,6 @@ static void update_stats(VP9_COMP *cpi) {
[mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
-
- // Count of last ref frame 0,0 usage
- if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
- cpi->inter_zz_count++;
- }
-}
-
-// TODO(jingning): the variables used here are little complicated. need further
-// refactoring on organizing the temporary buffers, when recursive
-// partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
-
- switch (bsize) {
- case BLOCK_64X64:
- return &x->sb64_context;
- case BLOCK_64X32:
- return &x->sb64x32_context[xd->sb_index];
- case BLOCK_32X64:
- return &x->sb32x64_context[xd->sb_index];
- case BLOCK_32X32:
- return &x->sb32_context[xd->sb_index];
- case BLOCK_32X16:
- return &x->sb32x16_context[xd->sb_index][xd->mb_index];
- case BLOCK_16X32:
- return &x->sb16x32_context[xd->sb_index][xd->mb_index];
- case BLOCK_16X16:
- return &x->mb_context[xd->sb_index][xd->mb_index];
- case BLOCK_16X8:
- return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_8X16:
- return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_8X8:
- return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_8X4:
- return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_4X8:
- return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_4X4:
- return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
- default:
- assert(0);
- return NULL ;
}
}
@@ -696,7 +699,7 @@ static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
default:
assert(0);
- return NULL ;
+ return NULL;
}
}
@@ -705,7 +708,6 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int p;
@@ -715,28 +717,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
vpx_memcpy(
- cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
a + num_4x4_blocks_wide * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
vpx_memcpy(
- cm->left_context[p]
+ cpi->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
l + num_4x4_blocks_high * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(cm->above_seg_context + mi_col, sa,
- sizeof(PARTITION_CONTEXT) * mi_width);
- vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(PARTITION_CONTEXT) * mi_height);
+ vpx_memcpy(cpi->above_seg_context + mi_col, sa,
+ sizeof(*cpi->above_seg_context) * mi_width);
+ vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(cpi->left_seg_context[0]) * mi_height);
}
static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
int p;
@@ -749,23 +750,24 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
for (p = 0; p < MAX_MB_PLANE; ++p) {
vpx_memcpy(
a + num_4x4_blocks_wide * p,
- cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
vpx_memcpy(
l + num_4x4_blocks_high * p,
- cm->left_context[p]
+ cpi->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(sa, cm->above_seg_context + mi_col,
- sizeof(PARTITION_CONTEXT) * mi_width);
- vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
- sizeof(PARTITION_CONTEXT) * mi_height);
+ vpx_memcpy(sa, cpi->above_seg_context + mi_col,
+ sizeof(*cpi->above_seg_context) * mi_width);
+ vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK),
+ sizeof(cpi->left_seg_context[0]) * mi_height);
}
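save_context()/restore_context() snapshot the above/left entropy and partition contexts so each RD trial starts from identical neighbor state; note the copies are now sized from the destination's element type rather than a named type. A sketch of that sizeof idiom, with PARTITION_CONTEXT as a stand-in typedef:

    #include <string.h>

    typedef char PARTITION_CONTEXT;  /* stand-in for the encoder's typedef */

    /* sizeof(*above) is derived from the element, so the byte count stays
     * right even if PARTITION_CONTEXT is later widened. */
    static void save_seg_ctx(PARTITION_CONTEXT *sa,
                             const PARTITION_CONTEXT *above,
                             int mi_col, int mi_width) {
      memcpy(sa, above + mi_col, sizeof(*above) * mi_width);
    }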
-static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize, int sub_index) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -783,7 +785,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
if (xd->ab_index > 0)
return;
}
- set_offsets(cpi, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
@@ -795,7 +797,8 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
}
}
-static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -812,8 +815,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
c1 = BLOCK_4X4;
if (bsize >= BLOCK_8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
c1 = *(get_sb_partitioning(x, bsize));
}
partition = partition_lookup[bsl][c1];
@@ -822,19 +825,19 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
case PARTITION_NONE:
if (output_enabled && bsize >= BLOCK_8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1);
break;
case PARTITION_VERT:
if (output_enabled)
cpi->partition_count[pl][PARTITION_VERT]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
break;
case PARTITION_HORZ:
if (output_enabled)
cpi->partition_count[pl][PARTITION_HORZ]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
@@ -846,7 +849,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
const int x_idx = i & 1, y_idx = i >> 1;
*get_sb_index(xd, subsize) = i;
- encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+ encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
break;
@@ -855,10 +858,9 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
break;
}
- if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, c1, bsize);
- }
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, c1, bsize);
}
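The partition context is recorded once per coded block unless the block was split, in which case the four recursive encode_sb() calls have already recorded the children; BLOCK_8X8 is the base case because its sub-8x8 partitions are not recursed. A toy model of that condition:

    enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT };

    /* Children of a split are recorded by the recursive calls, so only
     * non-split blocks (and the 8x8 base case) update the context here. */
    static int needs_context_update(int partition, int is_8x8_base_case) {
      return partition != PARTITION_SPLIT || is_8x8_base_case;
    }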
// Check to see if the given partition size is allowed for a specified number
@@ -886,13 +888,13 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
// However, at the bottom and right borders of the image the requested size
// may not be allowed in which case this code attempts to choose the largest
// allowable partition.
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
- int mi_row, int mi_col) {
+static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
const int mis = cm->mode_info_stride;
- int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
- int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
+ int row8x8_remaining = tile->mi_row_end - mi_row;
+ int col8x8_remaining = tile->mi_col_end - mi_col;
int block_row, block_col;
MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col;
int bh = num_8x8_blocks_high_lookup[bsize];
@@ -936,7 +938,7 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
for (block_col = 0; block_col < 8; ++block_col) {
MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
- int offset;
+ ptrdiff_t offset;
if (prev_mi) {
offset = prev_mi - cm->prev_mi;
@@ -947,324 +949,29 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
}
-static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8,
- BLOCK_SIZE bsize, int mis, int mi_row,
- int mi_col) {
- int r, c;
- const int bs = MAX(num_8x8_blocks_wide_lookup[bsize],
- num_8x8_blocks_high_lookup[bsize]);
- const int idx_str = mis * mi_row + mi_col;
- MODE_INFO **const mi2 = &mi_8x8[idx_str];
-
- mi2[0] = cm->mi + idx_str;
- mi2[0]->mbmi.sb_type = bsize;
-
- for (r = 0; r < bs; r++)
- for (c = 0; c < bs; c++)
- if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols)
- mi2[r * mis + c] = mi2[0];
-}
-
-typedef struct {
- int64_t sum_square_error;
- int64_t sum_error;
- int count;
- int variance;
-} var;
-
-typedef struct {
- var none;
- var horz[2];
- var vert[2];
-} partition_variance;
-
-#define VT(TYPE, BLOCKSIZE) \
- typedef struct { \
- partition_variance vt; \
- BLOCKSIZE split[4]; } TYPE;
-
-VT(v8x8, var)
-VT(v16x16, v8x8)
-VT(v32x32, v16x16)
-VT(v64x64, v32x32)
-
-typedef struct {
- partition_variance *vt;
- var *split[4];
-} vt_node;
-
-typedef enum {
- V16X16,
- V32X32,
- V64X64,
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) {
- int i;
- switch (bsize) {
- case BLOCK_64X64: {
- v64x64 *vt = (v64x64 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].vt.none;
- break;
- }
- case BLOCK_32X32: {
- v32x32 *vt = (v32x32 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].vt.none;
- break;
- }
- case BLOCK_16X16: {
- v16x16 *vt = (v16x16 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].vt.none;
- break;
- }
- case BLOCK_8X8: {
- v8x8 *vt = (v8x8 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i];
- break;
- }
- default:
- node->vt = 0;
- for (i = 0; i < 4; i++)
- node->split[i] = 0;
- assert(-1);
- }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
- v->sum_square_error = s2;
- v->sum_error = s;
- v->count = c;
- if (c > 0)
- v->variance = 256
- * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
- / v->count;
- else
- v->variance = 0;
-}
-
-// Combine 2 variance structures by summing the sum_error, sum_square_error,
-// and counts and then calculating the new variance.
-void sum_2_variances(var *r, var *a, var*b) {
- fill_variance(r, a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->count + b->count);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
- vt_node node;
- tree_to_node(data, bsize, &node);
- sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
- sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
- sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
- sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]);
- sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]);
-}
-
-#if PERFORM_RANDOM_PARTITIONING
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
- BLOCK_SIZE block_size, int mi_row,
- int mi_col, int mi_size) {
- VP9_COMMON * const cm = &cpi->common;
- vt_node vt;
- const int mis = cm->mode_info_stride;
- int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex;
-
- tree_to_node(data, block_size, &vt);
-
- // split none is available only if we have more than half a block size
- // in width and height inside the visible image
- if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows &&
- (rand() & 3) < 1) {
- set_block_size(cm, m, block_size, mis, mi_row, mi_col);
- return 1;
- }
-
- // vertical split is available on all but the bottom border
- if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
- && (rand() & 3) < 1) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
- mi_col);
- return 1;
- }
-
- // horizontal split is available on all but the right border
- if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
- && (rand() & 3) < 1) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
- mi_col);
- return 1;
- }
-
- return 0;
-}
-
-#else // !PERFORM_RANDOM_PARTITIONING
-
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m,
- BLOCK_SIZE bsize, int mi_row,
- int mi_col, int mi_size) {
- VP9_COMMON * const cm = &cpi->common;
- vt_node vt;
- const int mis = cm->mode_info_stride;
- int64_t threshold = 50 * cpi->common.base_qindex;
-
- tree_to_node(data, bsize, &vt);
-
- // split none is available only if we have more than half a block size
- // in width and height inside the visible image
- if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
- && vt.vt->none.variance < threshold) {
- set_block_size(cm, m, bsize, mis, mi_row, mi_col);
- return 1;
- }
-
- // vertical split is available on all but the bottom border
- if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
- && vt.vt->vert[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row,
- mi_col);
- return 1;
- }
-
- // horizontal split is available on all but the right border
- if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
- && vt.vt->horz[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row,
- mi_col);
- return 1;
- }
-
- return 0;
-}
-#endif // PERFORM_RANDOM_PARTITIONING
-
-static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
- int mi_row, int mi_col) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) {
+ VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- // TODO(JBB): More experimentation or testing of this threshold;
- int64_t threshold = 4;
- int i, j, k;
- v64x64 vt;
- unsigned char * s;
- int sp;
- const unsigned char * d;
- int dp;
- int pixels_wide = 64, pixels_high = 64;
-
- vp9_zero(vt);
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
-
- if (xd->mb_to_right_edge < 0)
- pixels_wide += (xd->mb_to_right_edge >> 3);
-
- if (xd->mb_to_bottom_edge < 0)
- pixels_high += (xd->mb_to_bottom_edge >> 3);
-
- s = x->plane[0].src.buf;
- sp = x->plane[0].src.stride;
-
- // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
- // but this needs more experimentation.
- threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
-
- d = vp9_64x64_zeros;
- dp = 64;
- if (cm->frame_type != KEY_FRAME) {
- int_mv nearest_mv, near_mv;
- const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, LAST_FRAME)];
- YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
- YV12_BUFFER_CONFIG *second_ref_fb = NULL;
-
- setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
- &xd->scale_factor[0]);
- setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
- &xd->scale_factor[1]);
+ int block_row, block_col;
- xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->this_mi->mbmi.sb_type = BLOCK_64X64;
- vp9_find_best_ref_mvs(xd,
- mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]],
- &nearest_mv, &near_mv);
-
- xd->this_mi->mbmi.mv[0] = nearest_mv;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
-
- d = xd->plane[0].dst.buf;
- dp = xd->plane[0].dst.stride;
- }
-
- // Fill in the entire tree of 8x8 variances for splits.
- for (i = 0; i < 4; i++) {
- const int x32_idx = ((i & 1) << 5);
- const int y32_idx = ((i >> 1) << 5);
- for (j = 0; j < 4; j++) {
- const int x16_idx = x32_idx + ((j & 1) << 4);
- const int y16_idx = y32_idx + ((j >> 1) << 4);
- v16x16 *vst = &vt.split[i].split[j];
- for (k = 0; k < 4; k++) {
- int x_idx = x16_idx + ((k & 1) << 3);
- int y_idx = y16_idx + ((k >> 1) << 3);
- unsigned int sse = 0;
- int sum = 0;
- if (x_idx < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
- d + y_idx * dp + x_idx, dp, &sse, &sum);
- fill_variance(&vst->split[k].vt.none, sse, sum, 64);
- }
- }
- }
- // Fill the rest of the variance tree by summing the split partition
- // values.
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
- }
- fill_variance_tree(&vt.split[i], BLOCK_32X32);
- }
- fill_variance_tree(&vt, BLOCK_64X64);
- // Now go through the entire structure, splitting every block size until
- // we get to one that's got a variance lower than our threshold, or we
- // hit 8x8.
- if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col,
- 4)) {
- for (i = 0; i < 4; ++i) {
- const int x32_idx = ((i & 1) << 2);
- const int y32_idx = ((i >> 1) << 2);
- if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32,
- (mi_row + y32_idx), (mi_col + x32_idx), 2)) {
- for (j = 0; j < 4; ++j) {
- const int x16_idx = ((j & 1) << 1);
- const int y16_idx = ((j >> 1) << 1);
- if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8,
- BLOCK_16X16,
- (mi_row + y32_idx + y16_idx),
- (mi_col + x32_idx + x16_idx), 1)) {
- for (k = 0; k < 4; ++k) {
- const int x8_idx = (k & 1);
- const int y8_idx = (k >> 1);
- set_block_size(cm, mi_8x8, BLOCK_8X8, mis,
- (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx));
- }
- }
+ if (cm->prev_mi) {
+ for (block_row = 0; block_row < 8; ++block_row) {
+ for (block_col = 0; block_col < 8; ++block_col) {
+ MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
+ if (prev_mi) {
+ if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 ||
+ abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8)
+ return 1;
}
}
}
}
+ return 0;
}
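sb_has_motion() gates the new LAST_FRAME_PARTITION_LOW_MOTION mode: the previous frame's partitioning is reused only when the co-located 64x64 block was close to static. Motion vectors are stored in 1/8-pel units, so the >= 8 comparison fires at one full pixel of motion. The test in isolation:

    #include <stdlib.h>

    /* MVs are in 1/8-pel units, so 8 equals one full pixel of motion. */
    static int mv_is_significant(int row_q3, int col_q3) {
      return abs(row_q3) >= 8 || abs(col_q3) >= 8;
    }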
-static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void rd_use_partition(VP9_COMP *cpi,
+ const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *rate, int64_t *dist,
int do_recon) {
@@ -1315,6 +1022,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ if (bsize == BLOCK_16X16) {
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ x->mb_energy = vp9_block_energy(cpi, x, bsize);
+ }
+
x->fast_ms = 0;
x->subblock_ref = 0;
@@ -1338,11 +1050,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
mi_row + (ms >> 1) < cm->mi_rows &&
mi_col + (ms >> 1) < cm->mi_cols) {
*(get_sb_partitioning(x, bsize)) = bsize;
- pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
get_block_context(x, bsize), INT64_MAX);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
none_rate += x->partition_cost[pl][PARTITION_NONE];
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1353,12 +1066,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
switch (partition) {
case PARTITION_NONE:
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
*get_sb_index(xd, subsize) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
@@ -1367,7 +1080,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*get_sb_index(xd, subsize) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
+ pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
@@ -1381,7 +1094,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
break;
case PARTITION_VERT:
*get_sb_index(xd, subsize) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
@@ -1390,7 +1103,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*get_sb_index(xd, subsize) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
@@ -1417,7 +1130,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
*get_sb_index(xd, subsize) = i;
- rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp,
+ rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
i != 3);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1432,8 +1145,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
default:
assert(0);
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
if (last_part_rate < INT_MAX)
last_part_rate += x->partition_cost[pl][partition];
@@ -1465,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+ pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
split_subsize, get_block_context(x, split_subsize),
INT64_MAX);
@@ -1478,17 +1192,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
if (i != 3)
- encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0,
split_subsize);
split_rate += rt;
split_dist += dt;
- set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row + y_idx, mi_col + x_idx, bsize);
split_rate += x->partition_cost[pl][PARTITION_NONE];
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
if (split_rate < INT_MAX) {
split_rate += x->partition_cost[pl][PARTITION_SPLIT];
@@ -1523,7 +1238,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
if (do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
*rate = chosen_rate;
*dist = chosen_dist;
@@ -1571,66 +1286,68 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8,
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
-static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col,
+static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ int row, int col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
+ VP9_COMMON * const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MODE_INFO ** mi_8x8 = xd->mi_8x8;
+ MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8;
+
const int left_in_image = xd->left_available && mi_8x8[-1];
const int above_in_image = xd->up_available &&
mi_8x8[-xd->mode_info_stride];
MODE_INFO ** above_sb64_mi_8x8;
MODE_INFO ** left_sb64_mi_8x8;
- // Frequency check
- if (cpi->sf.auto_min_max_partition_count <= 0) {
- cpi->sf.auto_min_max_partition_count =
- cpi->sf.auto_min_max_partition_interval;
+ int row8x8_remaining = tile->mi_row_end - row;
+ int col8x8_remaining = tile->mi_col_end - col;
+ int bh, bw;
+
+ // Trap case where we do not have a prediction.
+ if (!left_in_image && !above_in_image &&
+ ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) {
*min_block_size = BLOCK_4X4;
*max_block_size = BLOCK_64X64;
} else {
- --cpi->sf.auto_min_max_partition_count;
-
- // Set default values if no left or above neighbour
- if (!left_in_image && !above_in_image) {
- *min_block_size = BLOCK_4X4;
- *max_block_size = BLOCK_64X64;
- } else {
- VP9_COMMON *const cm = &cpi->common;
- int row8x8_remaining = cm->cur_tile_mi_row_end - row;
- int col8x8_remaining = cm->cur_tile_mi_col_end - col;
- int bh, bw;
-
- // Default "min to max" and "max to min"
- *min_block_size = BLOCK_64X64;
- *max_block_size = BLOCK_4X4;
-
- // Find the min and max partition sizes used in the left SB64
- if (left_in_image) {
- left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
- get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
- min_block_size, max_block_size);
- }
-
- // Find the min and max partition sizes used in the above SB64 taking
- // the values found for left as a starting point.
- if (above_in_image) {
- above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
- get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
- min_block_size, max_block_size);
- }
+ // Default "min to max" and "max to min"
+ *min_block_size = BLOCK_64X64;
+ *max_block_size = BLOCK_4X4;
+
+  // NOTE: each call to get_sb_partition_size_range() uses the previously
+  // passed-in values for min and max as a starting point.
+ //
+ // Find the min and max partition used in previous frame at this location
+ if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) {
+ get_sb_partition_size_range(cpi, prev_mi_8x8,
+ min_block_size, max_block_size);
+ }
- // Give a bit of leaway either side of the observed min and max
- *min_block_size = min_partition_size[*min_block_size];
- *max_block_size = max_partition_size[*max_block_size];
+ // Find the min and max partition sizes used in the left SB64
+ if (left_in_image) {
+ left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
+ min_block_size, max_block_size);
+ }
- // Check border cases where max and min from neighbours may not be legal.
- *max_block_size = find_partition_size(*max_block_size,
- row8x8_remaining, col8x8_remaining,
- &bh, &bw);
- *min_block_size = MIN(*min_block_size, *max_block_size);
+ // Find the min and max partition sizes used in the above SB64.
+ if (above_in_image) {
+ above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
+ min_block_size, max_block_size);
}
}
+
+  // Give a bit of leeway either side of the observed min and max
+ *min_block_size = min_partition_size[*min_block_size];
+ *max_block_size = max_partition_size[*max_block_size];
+
+ // Check border cases where max and min from neighbours may not be legal.
+ *max_block_size = find_partition_size(*max_block_size,
+ row8x8_remaining, col8x8_remaining,
+ &bh, &bw);
+ *min_block_size = MIN(*min_block_size, *max_block_size);
}
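The partition range is now seeded from the co-located previous-frame block plus the left and above SB64s, widened one step in each direction via the min_partition_size[]/max_partition_size[] lookups, and clamped so min never exceeds max after the border check. A sketch of the widen-and-clamp step with stand-in tables (the encoder's tables also cover rectangular sizes):

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Square sizes only; stand-ins for the encoder's lookup tables. */
    enum { B4X4, B8X8, B16X16, B32X32, B64X64, NUM_SIZES };
    static const int step_down[NUM_SIZES] = {B4X4, B4X4, B8X8, B16X16, B32X32};
    static const int step_up[NUM_SIZES] = {B8X8, B16X16, B32X32, B64X64, B64X64};

    static void widen_range(int *min_bs, int *max_bs) {
      *min_bs = step_down[*min_bs];     /* one partition step smaller */
      *max_bs = step_up[*max_bs];       /* one partition step larger */
      *min_bs = MIN(*min_bs, *max_bs);  /* border clamping can cross them */
    }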
static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
@@ -1735,7 +1452,8 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
VP9_COMMON * const cm = &cpi->common;
@@ -1772,7 +1490,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
return;
}
}
- assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+ assert(num_8x8_blocks_wide_lookup[bsize] ==
+ num_8x8_blocks_high_lookup[bsize]);
+
+ if (bsize == BLOCK_16X16) {
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ x->mb_energy = vp9_block_energy(cpi, x, bsize);
+ }
// Determine partition types in search according to the speed features.
// The threshold set here has to be of square block size.
@@ -1807,12 +1531,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
// PARTITION_NONE
if (partition_none_allowed) {
- pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
get_block_context(x, bsize), best_rd);
if (this_rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
this_rate += x->partition_cost[pl][PARTITION_NONE];
}
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
@@ -1860,7 +1585,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = i;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
&this_rate, &this_dist, i != 3, best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1872,8 +1597,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd && i == 4) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1881,12 +1607,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
best_dist = sum_dist;
best_rd = sum_rd;
*(get_sb_partitioning(x, bsize)) = subsize;
- } else {
- // skip rectangular partition test when larger block size
- // gives better rd cost
- if (cpi->sf.less_rectangular_check)
- do_rect &= !partition_none_allowed;
}
+ } else {
+ // skip rectangular partition test when larger block size
+ // gives better rd cost
+ if (cpi->sf.less_rectangular_check)
+ do_rect &= !partition_none_allowed;
}
partition_split_done = 1;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1906,7 +1632,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -1917,7 +1643,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate,
+ pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1929,8 +1655,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_HORZ];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1950,7 +1677,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
@@ -1960,7 +1687,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate,
+ pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1972,8 +1699,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_VERT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1991,7 +1719,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*dist = best_dist;
if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
if (bsize == BLOCK_64X64) {
assert(tp_orig < *tp);
assert(best_rate < INT_MAX);
@@ -2002,10 +1730,10 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
// Examines 64x64 block and chooses a best reference frame
-static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
+static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl;
int ms = bs / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -2023,10 +1751,10 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
if ((mi_row + (ms >> 1) < cm->mi_rows) &&
(mi_col + (ms >> 1) < cm->mi_cols)) {
cpi->set_ref_frame_mask = 1;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
get_block_context(x, BLOCK_64X64), INT64_MAX);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_64X64);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, BLOCK_64X64);
r += x->partition_cost[pl][PARTITION_NONE];
*(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
@@ -2036,27 +1764,27 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
}
-static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
- int *totalrate) {
+static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, TOKENEXTRA **tp, int *totalrate) {
VP9_COMMON * const cm = &cpi->common;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
- vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+ vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+ vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
// Code each SB in the row
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
int dummy_rate;
int64_t dummy_dist;
- vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv));
+ vp9_zero(cpi->mb.pred_mv);
if (cpi->sf.reference_masking)
- rd_pick_reference_frame(cpi, mi_row, mi_col);
+ rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
- if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
+ if (cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
@@ -2064,13 +1792,9 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
cpi->mb.source_variance = UINT_MAX;
if (cpi->sf.use_one_partition_size_always) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- set_partitioning(cpi, mi_8x8, mi_row, mi_col);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1);
- } else if (cpi->sf.partition_by_variance) {
- choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+ rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
} else {
if ((cpi->common.current_video_frame
@@ -2078,31 +1802,34 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
|| cm->prev_mi == 0
|| cpi->common.show_frame == 0
|| cpi->common.frame_type == KEY_FRAME
- || cpi->is_src_frame_alt_ref) {
+ || cpi->is_src_frame_alt_ref
+ || ((cpi->sf.use_lastframe_partitioning ==
+ LAST_FRAME_PARTITION_LOW_MOTION) &&
+ sb_has_motion(cpi, prev_mi_8x8))) {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, mi_row, mi_col,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
} else {
copy_partitioning(cpi, mi_8x8, prev_mi_8x8);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
}
}
} else {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, mi_row, mi_col,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
}
}
@@ -2120,7 +1847,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
xd->mode_info_stride = cm->mode_info_stride;
// reset intra mode contexts
- if (cm->frame_type == KEY_FRAME)
+ if (frame_is_intra_only(cm))
vp9_init_mbmode_probs(cm);
// Copy data over into macro block data structures.
@@ -2129,16 +1856,16 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// TODO(jkoleszar): are these initializations required?
setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]],
0, 0, NULL);
- setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
+ setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0);
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
- xd->this_mi->mbmi.mode = DC_PRED;
- xd->this_mi->mbmi.uv_mode = DC_PRED;
+ xd->mi_8x8[0]->mbmi.mode = DC_PRED;
+ xd->mi_8x8[0]->mbmi.uv_mode = DC_PRED;
- vp9_zero(cpi->y_mode_count)
- vp9_zero(cpi->y_uv_mode_count)
- vp9_zero(cm->counts.inter_mode)
+ vp9_zero(cpi->y_mode_count);
+ vp9_zero(cpi->y_uv_mode_count);
+ vp9_zero(cm->counts.inter_mode);
vp9_zero(cpi->partition_count);
vp9_zero(cpi->intra_inter_count);
vp9_zero(cpi->comp_inter_count);
@@ -2149,29 +1876,26 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
+ vpx_memset(cpi->above_context[0], 0,
+ sizeof(*cpi->above_context[0]) *
+ 2 * aligned_mi_cols * MAX_MB_PLANE);
+ vpx_memset(cpi->above_seg_context, 0,
+ sizeof(*cpi->above_seg_context) * aligned_mi_cols);
}
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
if (lossless) {
// printf("Switching to lossless\n");
- cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+ cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
+ cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
cpi->mb.optimize = 0;
cpi->common.lf.filter_level = 0;
cpi->zbin_mode_boost_enabled = 0;
cpi->common.tx_mode = ONLY_4X4;
} else {
// printf("Not lossless\n");
- cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+ cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
+ cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
}
}
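Lossless mode swaps the 4x4 DCT for the Walsh-Hadamard pair (vp9_fwht4x4/vp9_iwht4x4_add), zeroes the loop filter and trellis optimization, and pins the transform mode to ONLY_4X4; the two separate inverse hooks also collapse into a single itxm_add pointer. A sketch of that function-pointer toggle with stub transforms:

    typedef void (*fwd_txfm_fn)(const short *input, short *output, int stride);

    static void fwht4x4_stub(const short *in, short *out, int s) {
      (void)in; (void)out; (void)s;  /* placeholder body */
    }
    static void fdct4x4_stub(const short *in, short *out, int s) {
      (void)in; (void)out; (void)s;  /* placeholder body */
    }

    /* Select the forward 4x4 transform through a function pointer, the
     * same shape as the fwd_txm4x4 assignment above. */
    static void set_fwd_txfm(fwd_txfm_fn *hook, int lossless) {
      *hook = lossless ? fwht4x4_stub : fdct4x4_stub;
    }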
@@ -2204,21 +1928,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
totalrate = 0;
- // Reset frame count of inter 0,0 motion vector usage.
- cpi->inter_zz_count = 0;
-
vp9_zero(cm->counts.switchable_interp);
- vp9_zero(cpi->txfm_stepdown_count);
+ vp9_zero(cpi->tx_stepdown_count);
xd->mi_8x8 = cm->mi_grid_visible;
// required for vp9_frame_init_quantizer
- xd->this_mi =
xd->mi_8x8[0] = cm->mi;
- xd->mic_stream_ptr = cm->mi;
xd->last_mi = cm->prev_mi;
-
vp9_zero(cpi->NMVcount);
vp9_zero(cpi->coef_counts);
vp9_zero(cm->counts.eob_branch);
@@ -2229,7 +1947,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_frame_init_quantizer(cpi);
- vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
+ vp9_initialize_rd_consts(cpi);
vp9_initialize_me_consts(cpi, cm->base_qindex);
switch_tx_mode(cpi);
@@ -2263,16 +1981,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
const int tile_rows = 1 << cm->log2_tile_rows;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
-
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile;
TOKENEXTRA *tp_old = tp;
// For each row of SBs in the frame
- vp9_get_tile_col_offsets(cm, tile_col);
- for (mi_row = cm->cur_tile_mi_row_start;
- mi_row < cm->cur_tile_mi_row_end; mi_row += 8)
- encode_sb_row(cpi, mi_row, &tp, &totalrate);
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ for (mi_row = tile.mi_row_start;
+ mi_row < tile.mi_row_end; mi_row += 8)
+ encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
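Tile bounds now live in a stack-local TileInfo filled in by vp9_tile_init(), replacing the mutable cur_tile_* fields on VP9_COMMON, which makes the per-tile loop safe to re-enter. A sketch of the loop shape; the even split below is a stand-in, since the real tile boundaries are superblock-aligned:

    typedef struct {
      int mi_row_start, mi_row_end, mi_col_start, mi_col_end;
    } TileInfo;  /* field names match the real struct */

    static void encode_tiles(int tile_rows, int tile_cols,
                             int mi_rows, int mi_cols) {
      int r, c, mi_row;
      for (r = 0; r < tile_rows; ++r) {
        for (c = 0; c < tile_cols; ++c) {
          TileInfo tile;  /* local per-tile state, not shared globals */
          tile.mi_row_start = r * mi_rows / tile_rows;  /* stand-in split */
          tile.mi_row_end = (r + 1) * mi_rows / tile_rows;
          tile.mi_col_start = c * mi_cols / tile_cols;
          tile.mi_col_end = (c + 1) * mi_cols / tile_cols;
          for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
               mi_row += 8) {
            /* encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); */
          }
        }
      }
    }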
@@ -2306,7 +2023,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// Keep record of the total distortion this time around for future use
cpi->last_frame_distortion = cpi->frame_distortion;
#endif
-
}
static int check_dual_ref_flags(VP9_COMP *cpi) {
@@ -2347,18 +2063,19 @@ static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8,
int mis, TX_SIZE max_tx_size, int bw, int bh,
int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
- MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi;
- if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
return;
-
- if (mbmi->tx_size > max_tx_size) {
- const int ymbs = MIN(bh, cm->mi_rows - mi_row);
- const int xmbs = MIN(bw, cm->mi_cols - mi_col);
-
- assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
- get_skip_flag(mi_8x8, mis, ymbs, xmbs));
- set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
+ } else {
+ MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi;
+ if (mbmi->tx_size > max_tx_size) {
+ const int ymbs = MIN(bh, cm->mi_rows - mi_row);
+ const int xmbs = MIN(bw, cm->mi_cols - mi_col);
+
+ assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+ get_skip_flag(mi_8x8, mis, ymbs, xmbs));
+ set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
+ }
}
}
@@ -2424,7 +2141,7 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
static int get_frame_type(VP9_COMP *cpi) {
int frame_type;
- if (cpi->common.frame_type == KEY_FRAME)
+ if (frame_is_intra_only(&cpi->common))
frame_type = 0;
else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
frame_type = 3;
@@ -2453,9 +2170,9 @@ static void select_tx_mode(VP9_COMP *cpi) {
unsigned int total = 0;
int i;
for (i = 0; i < TX_SIZES; ++i)
- total += cpi->txfm_stepdown_count[i];
+ total += cpi->tx_stepdown_count[i];
if (total) {
- double fraction = (double)cpi->txfm_stepdown_count[0] / total;
+ double fraction = (double)cpi->tx_stepdown_count[0] / total;
cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
// printf("fraction = %f\n", fraction);
} // else keep unchanged
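select_tx_mode() looks at how often the RD loop stepped the transform size down from the largest allowed: if over 90% of counted blocks kept the largest size (bucket 0), the frame commits to ALLOW_32X32 and skips per-block signaling; otherwise it pays for TX_MODE_SELECT. The fraction test in isolation:

    /* counts[0] is "kept the largest transform size" (no stepdown). */
    static int mostly_largest_tx(const unsigned int *counts, int n) {
      unsigned int total = 0;
      int i;
      for (i = 0; i < n; ++i)
        total += counts[i];
      return total > 0 && (double)counts[0] / total > 0.90;
    }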
@@ -2472,21 +2189,23 @@ void vp9_encode_frame(VP9_COMP *cpi) {
// requires further work in the rd loop. For now the only supported encoder
// side behavior is where the ALT ref buffer has opposite sign bias to
// the other two.
- if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
- == cm->ref_frame_sign_bias[GOLDEN_FRAME])
- || (cm->ref_frame_sign_bias[ALTREF_FRAME]
- == cm->ref_frame_sign_bias[LAST_FRAME])) {
- cm->allow_comp_inter_inter = 0;
- } else {
- cm->allow_comp_inter_inter = 1;
- cm->comp_fixed_ref = ALTREF_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = GOLDEN_FRAME;
+ if (!frame_is_intra_only(cm)) {
+ if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+ || (cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[LAST_FRAME])) {
+ cm->allow_comp_inter_inter = 0;
+ } else {
+ cm->allow_comp_inter_inter = 1;
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+ }
}
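Compound inter-inter prediction stays enabled only when ALTREF's sign bias differs from both LAST and GOLDEN, i.e. the fixed reference sits on the opposite temporal side of the two variable references, and the whole check is now skipped for intra-only frames. The predicate by itself:

    /* Compound prediction requires the fixed (ALTREF) reference to lie on
     * the opposite temporal side of both variable references. */
    static int allow_compound(int bias_last, int bias_golden, int bias_alt) {
      return bias_alt != bias_golden && bias_alt != bias_last;
    }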
if (cpi->sf.RD) {
int i, pred_type;
- INTERPOLATIONFILTERTYPE filter_type;
+ INTERPOLATION_TYPE filter_type;
/*
* This code does a single RD pass over the whole frame assuming
* either compound, single or hybrid prediction as per whatever has
@@ -2554,7 +2273,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
cpi->rd_filter_threshes[frame_type][i] =
(cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
@@ -2627,7 +2346,6 @@ void vp9_encode_frame(VP9_COMP *cpi) {
} else {
encode_frame_internal(cpi);
}
-
}
static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) {
@@ -2732,7 +2450,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
YV12_BUFFER_CONFIG *second_ref_fb = NULL;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_ref(mbmi)) {
idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])];
second_ref_fb = &cm->yv12_fb[idx];
}
@@ -2744,7 +2462,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
&xd->scale_factor[1]);
-
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
}
@@ -2770,7 +2487,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h
index 3991969..3e9f538 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -17,6 +17,6 @@ struct yv12_buffer_config;
void vp9_setup_src_planes(struct macroblock *x,
const struct yv12_buffer_config *src,
- int mb_row, int mb_col);
+ int mi_row, int mi_col);
#endif // VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c
index c5e5dff..32b4593 100644
--- a/libvpx/vp9/encoder/vp9_encodeintra.c
+++ b/libvpx/vp9/encoder/vp9_encodeintra.c
@@ -9,7 +9,7 @@
*/
#include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/encoder/vp9_encodemb.h"
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 8dd80a5..75ed8ea 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -8,19 +8,22 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
-#include "vp9/encoder/vp9_encodemb.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/common/vp9_reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/common/vp9_systemdependent.h"
-#include "vp9_rtcd.h"
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_tokenize.h"
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
@@ -38,37 +41,6 @@ void vp9_subtract_block_c(int rows, int cols,
}
}
-static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
- else
- xd->inv_txm4x4_add(dqcoeff, dest, stride);
-}
-
-static void inverse_transform_b_8x8_add(int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
- else if (eob <= 10)
- vp9_short_idct10_8x8_add(dqcoeff, dest, stride);
- else
- vp9_short_idct8x8_add(dqcoeff, dest, stride);
-}
-
-static void inverse_transform_b_16x16_add(int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
- else if (eob <= 10)
- vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
- else
- vp9_short_idct16x16_add(dqcoeff, dest, stride);
-}
-
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -97,8 +69,7 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
vp9_subtract_sbuv(x, bsize);
}
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
typedef struct vp9_token_state vp9_token_state;
struct vp9_token_state {
@@ -109,7 +80,7 @@ struct vp9_token_state {
short qc;
};
-// TODO: experiments to find optimal multiple numbers
+// TODO(jimbankoski): experiment to find optimal RD numbers.
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
@@ -147,7 +118,7 @@ static void optimize_b(MACROBLOCK *mb,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblockd_plane *pd = &xd->plane[plane];
- const int ref = is_inter_block(&xd->this_mi->mbmi);
+ const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
@@ -161,40 +132,18 @@ static void optimize_b(MACROBLOCK *mb,
int best, band, pt;
PLANE_TYPE type = pd->plane_type;
int err_mult = plane_rd_mult[type];
- int default_eob;
+ const int default_eob = 16 << (tx_size << 1);
const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
const int16_t *dequant_ptr = pd->dequant;
- const uint8_t * band_translate;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
- switch (tx_size) {
- default:
- case TX_4X4:
- default_eob = 16;
- scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib));
- band_translate = vp9_coefband_trans_4x4;
- break;
- case TX_8X8:
- scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- default_eob = 64;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_16X16:
- scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- default_eob = 256;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_32X32:
- scan = vp9_default_scan_32x32;
- default_eob = 1024;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- }
+ get_scan(xd, tx_size, type, ib, &scan, &nb);
assert(eob <= default_eob);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
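Note: the switch removed above set default_eob to 16/64/256/1024 for TX_4X4..TX_32X32; an NxN transform holds 16 * 4^tx_size coefficients, which is exactly what 16 << (tx_size << 1) computes. A quick self-check (illustrative only):

#include <assert.h>
static void check_default_eob(void) {
  static const int eobs[4] = { 16, 64, 256, 1024 };  /* TX_4X4..TX_32X32 */
  int tx_size;
  for (tx_size = 0; tx_size < 4; ++tx_size)
    assert((16 << (tx_size << 1)) == eobs[tx_size]);
}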
@@ -213,7 +162,6 @@ static void optimize_b(MACROBLOCK *mb,
for (i = 0; i < eob; i++)
token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
qcoeff_ptr[scan[i]]].token];
- nb = vp9_get_coef_neighbors_handle(scan);
for (i = eob; i-- > i0;) {
int base_bits, d2, dx;
@@ -312,11 +260,10 @@ static void optimize_b(MACROBLOCK *mb,
best_index[i][1] = best;
/* Finally, make this the new head of the trellis. */
next = i;
- }
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- else {
+ } else {
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
band = get_coef_band(band_translate, i + 1);
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
@@ -385,38 +332,12 @@ static void optimize_init_b(int plane, BLOCK_SIZE bsize,
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
- int i;
- switch (tx_size) {
- case TX_4X4:
- vpx_memcpy(args->ctx->ta[plane], pd->above_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_w);
- vpx_memcpy(args->ctx->tl[plane], pd->left_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_h);
- break;
- case TX_8X8:
- for (i = 0; i < num_4x4_w; i += 2)
- args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_h; i += 2)
- args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i];
- break;
- case TX_16X16:
- for (i = 0; i < num_4x4_w; i += 4)
- args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_h; i += 4)
- args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i];
- break;
- case TX_32X32:
- for (i = 0; i < num_4x4_w; i += 8)
- args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_h; i += 8)
- args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i];
- break;
- default:
- assert(0);
- }
+ vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
+ pd->above_context, pd->left_context,
+ num_4x4_w, num_4x4_h);
}
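Note: the four unrolled cases deleted above all mark a transform block's above/left context as set when any of its 4x4 sub-contexts is set (the uint16/32/64 loads were a fast any-byte-nonzero test over 2/4/8 bytes). A plain-C sketch of what the consolidated vp9_get_entropy_contexts is assumed to compute; the 4x4 case stays a straight copy:

#include <string.h>
static void get_entropy_contexts_sketch(TX_SIZE tx_size,
                                        ENTROPY_CONTEXT t_above[16],
                                        ENTROPY_CONTEXT t_left[16],
                                        const ENTROPY_CONTEXT *above,
                                        const ENTROPY_CONTEXT *left,
                                        int num_4x4_w, int num_4x4_h) {
  const int step = 1 << tx_size;  /* 1, 2, 4 or 8 4x4 units per tx block */
  int i, j;
  if (tx_size == TX_4X4) {
    memcpy(t_above, above, sizeof(*above) * num_4x4_w);
    memcpy(t_left, left, sizeof(*left) * num_4x4_h);
    return;
  }
  for (i = 0; i < num_4x4_w; i += step) {
    ENTROPY_CONTEXT any = 0;
    for (j = 0; j < step; ++j)
      any |= above[i + j];
    t_above[i] = any ? 1 : 0;
  }
  for (i = 0; i < num_4x4_h; i += step) {
    ENTROPY_CONTEXT any = 0;
    for (j = 0; j < step; ++j)
      any |= left[i + j];
    t_left[i] = any ? 1 : 0;
  }
}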
void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -445,9 +366,9 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
yoff = 32 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -459,7 +380,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 16 * (block & twmask);
yoff = 16 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_fdct16x16(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -471,7 +392,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 8 * (block & twmask);
yoff = 8 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_fdct8x8(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -482,7 +403,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 4 * (block & twmask);
yoff = 4 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ x->fwd_txm4x4(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -497,6 +418,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx *const ctx = args->ctx;
struct macroblockd_plane *const pd = &xd->plane[plane];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
block);
@@ -504,38 +426,68 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
pd->dst.buf, pd->dst.stride);
+
+ // TODO(jingning): per transformed block zero forcing only enabled for
+ // luma component. will integrate chroma components as well.
+ if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+ int i, j;
+ pd->eobs[block] = 0;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ ctx->ta[plane][i] = 0;
+ ctx->tl[plane][j] = 0;
+ return;
+ }
+
vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
if (x->optimize)
- vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+ vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
if (x->skip_encode || pd->eobs[block] == 0)
return;
switch (tx_size) {
case TX_32X32:
- vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+ vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_16X16:
- inverse_transform_b_16x16_add(pd->eobs[block], dqcoeff, dst,
- pd->dst.stride);
+ vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_8X8:
- inverse_transform_b_8x8_add(pd->eobs[block], dqcoeff, dst,
- pd->dst.stride);
+ vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
- dst, pd->dst.stride);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
default:
assert(!"Invalid transform size");
}
}
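Note: the vp9_idct*_add entry points now take the eob and are assumed to fold the dispatch that the deleted inverse_transform_b_* helpers did by hand. Reconstructed from the 16x16 helper removed earlier in this file:

/* eob <= 1: DC-only; eob <= 10: low-frequency coeffs only; else full idct. */
static void idct16x16_add_dispatch(int16_t *dqcoeff, uint8_t *dest,
                                   int stride, int eob) {
  if (eob <= 1)
    vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
  else if (eob <= 10)
    vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
  else
    vp9_short_idct16x16_add(dqcoeff, dest, stride);
}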
+static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+ block);
+
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
+ pd->dst.buf, pd->dst.stride);
+
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+
+ if (pd->eobs[block] == 0)
+ return;
+
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+}
+
void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
@@ -545,7 +497,7 @@ void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
if (x->optimize)
optimize_init_b(0, bsize, &arg);
- foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
+ foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg);
}
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -569,7 +521,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
@@ -607,14 +559,14 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob)
- vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+ vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_16X16:
tx_type = get_tx_type_16x16(pd->plane_type, xd);
@@ -631,19 +583,12 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(16, 16, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
- else
- x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
- if (!x->skip_encode && *eob) {
- if (tx_type == DCT_DCT)
- inverse_transform_b_16x16_add(*eob, dqcoeff, dst, pd->dst.stride);
- else
- vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
- }
+ if (!x->skip_encode && *eob)
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -660,26 +605,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(8, 8, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
- else
- x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
- if (!x->skip_encode && *eob) {
- if (tx_type == DCT_DCT)
- inverse_transform_b_8x8_add(*eob, dqcoeff, dst, pd->dst.stride);
- else
- vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
- }
+ if (!x->skip_encode && *eob)
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
scan = get_scan_4x4(tx_type);
iscan = get_iscan_4x4(tx_type);
if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
- mode = xd->this_mi->bmi[block].as_mode;
+ mode = xd->mi_8x8[0]->bmi[block].as_mode;
else
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
@@ -695,7 +633,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
else
- x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ x->fwd_txm4x4(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -704,9 +642,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
else
- vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
}
break;
default:
diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h
index 54e69fd..61dd735 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/libvpx/vp9/encoder/vp9_encodemb.h
@@ -16,32 +16,17 @@
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_onyxc_int.h"
-typedef enum {
- RD_DC_PRED = DC_PRED,
- RD_V_PRED = V_PRED,
- RD_H_PRED = H_PRED,
- RD_D45_PRED = D45_PRED,
- RD_D135_PRED = D135_PRED,
- RD_D117_PRED = D117_PRED,
- RD_D153_PRED = D153_PRED,
- RD_D207_PRED = D207_PRED,
- RD_D63_PRED = D63_PRED,
- RD_TM_PRED = TM_PRED,
- RD_NEARESTMV = NEARESTMV,
- RD_NEARMV = NEARMV,
- RD_ZEROMV = ZEROMV,
- RD_NEWMV = NEWMV,
- RD_I4X4_PRED,
- RD_SPLITMV,
- RD_MODE_COUNT
-} RD_PREDICTION_MODE;
-
typedef struct {
- RD_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE mode;
MV_REFERENCE_FRAME ref_frame;
MV_REFERENCE_FRAME second_ref_frame;
} MODE_DEFINITION;
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ MV_REFERENCE_FRAME second_ref_frame;
+} REF_DEFINITION;
+
struct optimize_ctx {
ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index ed3a2bb..e2c6c4c 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -8,13 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_encodemv.h"
-#include <math.h>
#ifdef ENTROPY_STATS
extern unsigned int active_section;
@@ -124,8 +124,9 @@ static void build_nmv_component_cost_table(int *mvcost,
}
}
-static int update_mv(vp9_writer *w, const unsigned int ct[2],
- vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
+static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
+ vp9_prob upd_p) {
+ const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);
vp9_prob mod_p = new_p | 1;
const int cur_b = cost_branch256(ct, *cur_p);
const int mod_b = cost_branch256(ct, mod_p);
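Note: update_mv now derives new_p from the branch counts itself. get_binary_prob is, roughly, the rounded maximum-likelihood estimate of branch 0 in Q8, clamped to [1, 255]; a sketch of the assumed helper (not a verbatim copy):

static vp9_prob binary_prob_sketch(unsigned int ct0, unsigned int ct1) {
  const unsigned int den = ct0 + ct1;
  unsigned int p;
  if (den == 0)
    return 128;  /* no observations: stay at the midpoint */
  p = (ct0 * 256 + (den >> 1)) / den;  /* rounded ct0/den in Q8 */
  return (vp9_prob)(p < 1 ? 1 : (p > 255 ? 255 : p));
}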
@@ -143,7 +144,6 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2],
static void counts_to_nmv_context(
nmv_context_counts *nmv_count,
- nmv_context *prob,
int usehp,
unsigned int (*branch_ct_joint)[2],
unsigned int (*branch_ct_sign)[2],
@@ -156,29 +156,24 @@ static void counts_to_nmv_context(
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
branch_ct_joint,
nmv_count->joints, 0);
for (i = 0; i < 2; ++i) {
const uint32_t s0 = nmv_count->comps[i].sign[0];
const uint32_t s1 = nmv_count->comps[i].sign[1];
- prob->comps[i].sign = get_binary_prob(s0, s1);
branch_ct_sign[i][0] = s0;
branch_ct_sign[i][1] = s1;
vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
+ branch_ct_classes[i],
+ nmv_count->comps[i].classes, 0);
vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
branch_ct_class0[i],
nmv_count->comps[i].class0, 0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
const uint32_t b0 = nmv_count->comps[i].bits[j][0];
const uint32_t b1 = nmv_count->comps[i].bits[j][1];
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
branch_ct_bits[i][j][0] = b0;
branch_ct_bits[i][j][1] = b1;
}
@@ -186,12 +181,10 @@ static void counts_to_nmv_context(
for (i = 0; i < 2; ++i) {
for (k = 0; k < CLASS0_SIZE; ++k) {
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
branch_ct_class0_fp[i][k],
nmv_count->comps[i].class0_fp[k], 0);
}
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
branch_ct_fp[i],
nmv_count->comps[i].fp, 0);
}
@@ -202,11 +195,9 @@ static void counts_to_nmv_context(
const uint32_t hp0 = nmv_count->comps[i].hp[0];
const uint32_t hp1 = nmv_count->comps[i].hp[1];
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
branch_ct_class0_hp[i][0] = c0_hp0;
branch_ct_class0_hp[i][1] = c0_hp1;
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
branch_ct_hp[i][0] = hp0;
branch_ct_hp[i][1] = hp1;
}
@@ -215,7 +206,6 @@ static void counts_to_nmv_context(
void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int i, j;
- nmv_context prob;
unsigned int branch_ct_joint[MV_JOINTS - 1][2];
unsigned int branch_ct_sign[2][2];
unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
@@ -227,30 +217,28 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
- counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ counts_to_nmv_context(&cpi->NMVcount, usehp,
branch_ct_joint, branch_ct_sign, branch_ct_classes,
branch_ct_class0, branch_ct_bits,
branch_ct_class0_fp, branch_ct_fp,
branch_ct_class0_hp, branch_ct_hp);
for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
- NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
- prob.comps[i].sign, NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- prob.comps[i].classes[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- prob.comps[i].class0[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- prob.comps[i].bits[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
@@ -258,21 +246,19 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int k;
for (k = 0; k < 3; ++k)
update_mv(bc, branch_ct_class0_fp[i][j][k],
- &mvc->comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
+ &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
}
for (j = 0; j < 3; ++j)
- update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
- prob.comps[i].fp[j], NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- prob.comps[i].class0_hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
- prob.comps[i].hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
}
}
@@ -314,44 +300,34 @@ void vp9_build_nmv_cost_table(int *mvjoint,
build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
}
-void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
+static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound,
+ nmv_context_counts *counts) {
+ int i;
+ for (i = 0; i < 1 + is_compound; ++i) {
+ const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
+ mv[i].as_mv.col - ref[i].as_mv.col };
+ vp9_inc_mv(&diff, counts);
+ }
+}
+
+void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- MV diff;
- const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
- const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
- int idx, idy;
+ const int is_compound = has_second_ref(mbmi);
if (mbmi->sb_type < BLOCK_8X8) {
- PARTITION_INFO *pi = x->partition_info;
- for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
- for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
- if (pi->bmi[i].mode == NEWMV) {
- diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row;
- diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
-
- if (mi->mbmi.ref_frame[1] > INTRA_FRAME) {
- diff.row = mi->bmi[i].as_mv[1].as_mv.row -
- second_best_ref_mv->as_mv.row;
- diff.col = mi->bmi[i].as_mv[1].as_mv.col -
- second_best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
- }
- }
+ if (mi->bmi[i].as_mode == NEWMV)
+ inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
} else if (mbmi->mode == NEWMV) {
- diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
- diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
-
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
- diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
- }
+ inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
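Worked example for the new helper (illustrative values only): a compound block with motion vectors (12,-5) and (3,7), coded against predictors (8,-4) and (0,0), feeds the residuals (4,-1) and (3,7) into the NMV counts:

int_mv mv[2], ref[2];
mv[0].as_mv.row = 12;  mv[0].as_mv.col = -5;
mv[1].as_mv.row = 3;   mv[1].as_mv.col = 7;
ref[0].as_mv.row = 8;  ref[0].as_mv.col = -4;
ref[1].as_mv.row = 0;  ref[1].as_mv.col = 0;
inc_mvs(mv, ref, 1 /* is_compound */, &cpi->NMVcount);  /* counts (4,-1), (3,7) */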
diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h
index 2789ce1..6331778 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/libvpx/vp9/encoder/vp9_encodemv.h
@@ -25,7 +25,7 @@ void vp9_build_nmv_cost_table(int *mvjoint,
int usehp,
int mvc_flag_v,
int mvc_flag_h);
-void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv, int_mv *second_best_ref_mv);
+
+void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]);
#endif // VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 9cf7b83..6a3555d 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -8,8 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "math.h"
-#include "limits.h"
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
#include "vp9/encoder/vp9_block.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_variance.h"
@@ -23,13 +24,13 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
-#include <stdio.h>
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_vaq.h"
#include "./vpx_scale_rtcd.h"
// TODO(jkoleszar): for setup_dst_planes
#include "vp9/common/vp9_reconinter.h"
@@ -77,7 +78,8 @@ static int select_cq_level(int qindex) {
}
-// Resets the first pass file to the given position using a relative seek from the current position
+// Resets the first pass file to the given position using a relative seek from
+// the current position.
static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
cpi->twopass.stats_in = position;
}
@@ -250,8 +252,10 @@ static void avg_stats(FIRSTPASS_STATS *section) {
section->duration /= section->count;
}
-// Calculate a modified Error used in distributing bits between easier and harder frames
-static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+static double calculate_modified_err(VP9_COMP *cpi,
+ FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
const double av_err = stats->ssim_weighted_pred_err / stats->count;
const double this_err = this_frame->ssim_weighted_pred_err;
@@ -260,38 +264,43 @@ static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
static const double weight_table[256] = {
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
- 0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
- 0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
- 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500,
+ 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250,
+ 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000,
+ 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+ 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500,
+ 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000
};
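Note: the reformatted table above tabulates a simple piecewise ramp over the 8-bit luma level: 0.02 up to level 32, a linear ramp through level 63, then 1.0. Equivalent closed form (a sketch, for reference only):

static double pixel_weight(int level) {  /* level in [0, 255] */
  if (level <= 32)
    return 0.020000;             /* near-black pixels get minimal weight */
  if (level < 64)
    return (level - 32) / 32.0;  /* 0.031250 at 33 ... 0.968750 at 63 */
  return 1.000000;
}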
static double simple_weight(YV12_BUFFER_CONFIG *source) {
@@ -300,7 +309,8 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) {
uint8_t *src = source->y_buffer;
double sum_weights = 0.0;
- // Loop throught the Y plane raw examining levels and creating a weight for the image
+ // Loop through the Y plane examining levels and creating a weight for
+ // the image.
i = source->y_height;
do {
j = source->y_width;
@@ -340,13 +350,15 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
}
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int *best_motion_err, int recon_yoffset) {
MACROBLOCKD *const xd = &x->e_mbd;
// Set up pointers for this macro block recon buffer
xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
- switch (xd->this_mi->mbmi.sb_type) {
+ switch (xd->mi_8x8[0]->mbmi.sb_type) {
case BLOCK_8X8:
vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
@@ -385,7 +397,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
int n;
vp9_variance_fn_ptr_t v_fn_ptr =
- cpi->fn_ptr[xd->this_mi->mbmi.sb_type];
+ cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type];
int new_mv_mode_penalty = 256;
int sr = 0;
@@ -402,7 +414,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
further_steps -= sr;
// override the default variance function to use MSE
- switch (xd->this_mi->mbmi.sb_type) {
+ switch (xd->mi_8x8[0]->mbmi.sb_type) {
case BLOCK_8X8:
v_fn_ptr.vf = vp9_mse8x8;
break;
@@ -444,9 +456,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
while (n < further_steps) {
n++;
- if (num00)
+ if (num00) {
num00--;
- else {
+ } else {
tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
step_param + n, x->sadperbit16,
&num00, &v_fn_ptr,
@@ -469,13 +481,14 @@ void vp9_first_pass(VP9_COMP *cpi) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile;
int recon_yoffset, recon_uvoffset;
const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
- YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
const int recon_y_stride = lst_yv12->y_stride;
const int recon_uv_stride = lst_yv12->uv_stride;
int64_t intra_error = 0;
@@ -504,12 +517,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
setup_dst_planes(xd, new_yv12, 0, 0);
- x->partition_info = x->pi;
xd->mi_8x8 = cm->mi_grid_visible;
// required for vp9_frame_init_quantizer
- xd->this_mi =
xd->mi_8x8[0] = cm->mi;
- xd->mic_stream_ptr = cm->mi;
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -520,9 +530,12 @@ void vp9_first_pass(VP9_COMP *cpi) {
// if ( 0 )
{
vp9_init_mv_probs(cm);
- vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
+ vp9_initialize_rd_consts(cpi);
}
+ // tiling is ignored in the first pass
+ vp9_tile_init(&tile, cm, 0, 0);
+
// for each macroblock row in image
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
int_mv best_ref_mv;
@@ -534,16 +547,21 @@ void vp9_first_pass(VP9_COMP *cpi) {
recon_yoffset = (mb_row * recon_y_stride * 16);
recon_uvoffset = (mb_row * recon_uv_stride * 8);
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8));
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders
+ x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
- + (VP9BORDERINPIXELS - 8);
+ + BORDER_MV_PIXELS_B16;
// for each macroblock col in image
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
int this_error;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ double error_weight;
+
+ vp9_clear_system_state(); // __asm emms;
+ error_weight = 1.0; // avoid uninitialized warnings
xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -552,40 +570,54 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (mb_col * 2 + 1 < cm->mi_cols) {
if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->this_mi->mbmi.sb_type = BLOCK_16X16;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16;
} else {
- xd->this_mi->mbmi.sb_type = BLOCK_16X8;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8;
}
} else {
if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->this_mi->mbmi.sb_type = BLOCK_8X16;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16;
} else {
- xd->this_mi->mbmi.sb_type = BLOCK_8X8;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8;
}
}
- xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
- set_mi_row_col(cm, xd,
+ xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile,
mb_row << 1,
- 1 << mi_height_log2(xd->this_mi->mbmi.sb_type),
+ num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
mb_col << 1,
- 1 << mi_height_log2(xd->this_mi->mbmi.sb_type));
+ num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
+ cm->mi_rows, cm->mi_cols);
+
+ if (cpi->sf.variance_adaptive_quantization) {
+ int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
+ error_weight = vp9_vaq_inv_q_ratio(energy);
+ }
// do intra 16x16 prediction
this_error = vp9_encode_intra(x, use_dc_pred);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ this_error *= error_weight;
+ }
- // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
- // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
- // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+ // intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (eg a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
// This penalty adds a cost matching that of a 0,0 mv to the intra case.
this_error += intrapenalty;
// Cumulative intra error total
intra_error += (int64_t)this_error;
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8));
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
- + (VP9BORDERINPIXELS - 8);
+ + BORDER_MV_PIXELS_B16;
// Other than for the first frame do a motion search
if (cm->current_video_frame > 0) {
@@ -602,12 +634,21 @@ void vp9_first_pass(VP9_COMP *cpi) {
first_pass_motion_search(cpi, x, &best_ref_mv,
&mv.as_mv, lst_yv12,
&motion_error, recon_yoffset);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ motion_error *= error_weight;
+ }
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
if (best_ref_mv.as_int) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
lst_yv12, &tmp_err, recon_yoffset);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ tmp_err *= error_weight;
+ }
if (tmp_err < motion_error) {
motion_error = tmp_err;
@@ -624,6 +665,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
first_pass_motion_search(cpi, x, &zero_ref_mv,
&tmp_mv.as_mv, gld_yv12,
&gf_motion_error, recon_yoffset);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ gf_motion_error *= error_weight;
+ }
if ((gf_motion_error < motion_error) &&
(gf_motion_error < this_error)) {
@@ -643,9 +688,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
sr_coded_error += gf_motion_error;
else
sr_coded_error += this_error;
- } else
+ } else {
sr_coded_error += motion_error;
-
+ }
/* Intra assumed best */
best_ref_mv.as_int = 0;
@@ -660,17 +705,17 @@ void vp9_first_pass(VP9_COMP *cpi) {
neutral_count++;
}
- mv.as_mv.row <<= 3;
- mv.as_mv.col <<= 3;
+ mv.as_mv.row *= 8;
+ mv.as_mv.col *= 8;
this_error = motion_error;
vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
- xd->this_mi->mbmi.tx_size = TX_4X4;
- xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->this_mi->mbmi.ref_frame[1] = NONE;
+ xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
+ xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1,
mb_col << 1,
- xd->this_mi->mbmi.sb_type);
- vp9_encode_sby(x, xd->this_mi->mbmi.sb_type);
+ xd->mi_8x8[0]->mbmi.sb_type);
+ vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
sum_mvr += mv.as_mv.row;
sum_mvr_abs += abs(mv.as_mv.row);
sum_mvc += mv.as_mv.col;
@@ -717,9 +762,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
}
}
- } else
+ } else {
sr_coded_error += (int64_t)this_error;
-
+ }
coded_error += (int64_t)this_error;
// adjust to the next column of macroblocks
@@ -778,16 +823,19 @@ void vp9_first_pass(VP9_COMP *cpi) {
fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
fps.MVc = (double)sum_mvc / (double)mvcount;
fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
- fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
- fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) /
+ (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) /
+ (double)mvcount;
fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
fps.new_mv_count = new_mv_count;
fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
}
- // TODO: handle the case when duration is set to 0, or something less
- // than the full time between subsequent values of cpi->source_time_stamp.
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
fps.duration = (double)(cpi->source->ts_end
- cpi->source->ts_start);
@@ -807,15 +855,16 @@ void vp9_first_pass(VP9_COMP *cpi) {
2.0))) {
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
cpi->twopass.sr_update_lag = 1;
- } else
+ } else {
cpi->twopass.sr_update_lag++;
-
+ }
// swap frame pointers so last frame refers to the frame we just compressed
swap_yv12(lst_yv12, new_yv12);
vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
- // Special case for the first frame. Copy into the GF buffer as a second reference.
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
if (cm->current_video_frame == 0)
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
@@ -823,7 +872,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (0) {
char filename[512];
FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
if (cm->current_video_frame == 0)
recon_file = fopen(filename, "wb");
@@ -835,7 +885,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
cm->current_video_frame++;
-
}
// Estimate a cost per mb attributable to overheads such as the coding of
@@ -878,7 +927,7 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi,
(av_intra * intra_cost)) * cpi->common.MBs) << 9;
// return mv_cost + mode_cost;
- // TODO PGW Fix overhead costs for extended Q range
+ // TODO(paulwilkins): Fix overhead costs for extended Q range.
#endif
return 0;
}
@@ -1102,8 +1151,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
FIRSTPASS_STATS *start_pos;
double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
- double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
@@ -1141,15 +1190,17 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// This variable monitors how far behind the second ref update is lagging
cpi->twopass.sr_update_lag = 1;
- // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+ // Scan the first pass file and calculate an average Intra / Inter error score
+ // ratio for the sequence.
{
double sum_iiratio = 0.0;
double IIRatio;
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
+ start_pos = cpi->twopass.stats_in; // Note the starting "file" position.
while (input_stats(cpi, &this_frame) != EOF) {
- IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = this_frame.intra_error
+ / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
sum_iiratio += IIRatio;
}
@@ -1161,21 +1212,21 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
reset_fpf_position(cpi, start_pos);
}
- // Scan the first pass file and calculate a modified total error based upon the bias/power function
- // used to allocate bits
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
{
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
+ start_pos = cpi->twopass.stats_in; // Note starting "file" position
cpi->twopass.modified_error_total = 0.0;
cpi->twopass.modified_error_used = 0.0;
while (input_stats(cpi, &this_frame) != EOF) {
- cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+ cpi->twopass.modified_error_total +=
+ calculate_modified_err(cpi, &this_frame);
}
cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
- reset_fpf_position(cpi, start_pos); // Reset file position
-
+ reset_fpf_position(cpi, start_pos); // Reset file position
}
}
@@ -1321,7 +1372,6 @@ static void accumulate_frame_motion_stats(
(this_frame_mvc_ratio < this_frame->mvc_abs)
? (this_frame_mvc_ratio * motion_pct)
: this_frame->mvc_abs * motion_pct;
-
}
}
@@ -1380,7 +1430,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&this_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
@@ -1416,7 +1467,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&this_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
@@ -1432,7 +1484,6 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
boost_score += (decay_accumulator *
calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
}
*b_boost = (int)boost_score;
@@ -1666,7 +1717,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&next_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// Cumulative effect of prediction quality decay
if (!flash_detected) {
@@ -1709,8 +1761,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
(abs_mv_in_out_accumulator > 3.0) ||
(mv_in_out_accumulator < -2.0) ||
- ((boost_score - old_boost_score) < IIFACTOR))
- )) {
+ ((boost_score - old_boost_score) < IIFACTOR)))) {
boost_score = old_boost_score;
break;
}
@@ -1764,7 +1815,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(mv_in_out_accumulator > -2.0)) &&
(boost_score > 100)) {
// Alternative boost calculation for alt ref
- cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+ &b_boost);
cpi->source_alt_ref_pending = 1;
#if CONFIG_MULTIPLE_ARF
@@ -1841,9 +1893,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.gf_group_bits =
(int64_t)(cpi->twopass.kf_group_bits *
(gf_group_err / cpi->twopass.kf_group_error_left));
- } else
+ } else {
cpi->twopass.gf_group_bits = 0;
-
+ }
cpi->twopass.gf_group_bits =
(cpi->twopass.gf_group_bits < 0)
? 0
@@ -1907,11 +1959,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (gf_bits > alt_gf_bits)
gf_bits = alt_gf_bits;
- }
- // Else if it is harder than other frames in the group make sure it at
- // least receives an allocation in keeping with its relative error
- // score, otherwise it may be worse off than an "un-boosted" frame
- else {
+ } else {
+ // If it is harder than other frames in the group make sure it at
+ // least receives an allocation in keeping with its relative error
+ // score, otherwise it may be worse off than an "un-boosted" frame.
int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
mod_frame_err /
DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
@@ -2023,9 +2074,9 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
// the top end.
- if (target_frame_size < 0)
+ if (target_frame_size < 0) {
target_frame_size = 0;
- else {
+ } else {
if (target_frame_size > max_bits)
target_frame_size = max_bits;
@@ -2093,14 +2144,19 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->twopass.est_max_qcorrection_factor = 1.0;
// Set a cq_level in constrained quality mode.
+ // Commenting this code out for now since it does not seem to be
+ // working well.
+ /*
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
- section_target_bandwidth);
+ section_target_bandwidth);
- cpi->cq_target_quality = cpi->oxcf.cq_level;
if (est_cq > cpi->cq_target_quality)
cpi->cq_target_quality = est_cq;
+ else
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
}
+ */
// guess at maxq needed in 2nd pass
cpi->twopass.maxq_max_limit = cpi->worst_quality;
@@ -2113,17 +2169,14 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->ni_av_qi = tmp_q;
cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-#ifndef ONE_SHOT_Q_ESTIMATE
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
// estimate for the clip is bad, but helps prevent excessive
// variation in Q, especially near the end of a clip
// where for example a small overspend may cause Q to crash
adjust_maxq_qrange(cpi);
-#endif
}
-#ifndef ONE_SHOT_Q_ESTIMATE
// The last few frames of a clip almost always have too few or too many
// bits and for the sake of overly exact rate control we don't want to make
// radical adjustments to the allowed quantizer range just to use up a
@@ -2146,7 +2199,6 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->active_worst_quality =
adjust_active_maxq(cpi->active_worst_quality, tmp_q);
}
-#endif
}
vp9_zero(this_frame);
if (EOF == input_stats(cpi, &this_frame))
@@ -2243,16 +2295,17 @@ static int test_candidate_kf(VP9_COMP *cpi,
if ((this_frame->pcnt_second_ref < 0.10) &&
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
- (
- ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
- ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
- ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
- (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
- ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
- )
- )
- )
- ) {
+ (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ .40) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
int i;
FIRSTPASS_STATS *start_pos;
@@ -2270,7 +2323,8 @@ static int test_candidate_kf(VP9_COMP *cpi,
// Examine how well the key frame predicts subsequent frames
for (i = 0; i < 16; i++) {
- next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
if (next_iiratio > RMAX)
next_iiratio = RMAX;
@@ -2279,7 +2333,8 @@ static int test_candidate_kf(VP9_COMP *cpi,
if (local_next_frame.pcnt_inter > 0.85)
decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
else
- decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+ decay_accumulator =
+ decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
// decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
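Note, with illustrative numbers: a frame with pcnt_inter = 0.90 scales decay_accumulator by 0.90 directly, while one with pcnt_inter = 0.60 scales it by (0.85 + 0.60) / 2 = 0.725 rather than 0.60, so the averaging softens the decay for frames with little inter coverage.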
@@ -2307,9 +2362,9 @@ static int test_candidate_kf(VP9_COMP *cpi,
// If there is tolerable prediction for at least the next 3 frames then
// break out else discard this potential key frame and move on
- if (boost_score > 30.0 && (i > 3))
+ if (boost_score > 30.0 && (i > 3)) {
is_viable_kf = 1;
- else {
+ } else {
// Reset the file position
reset_fpf_position(cpi, start_pos);
@@ -2369,8 +2424,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ // These figures keep intra and coded error counts for all frames including
+ // key frames in the group. The effect of the key frame itself can be
+ // subtracted out using the first_frame data collected above.
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
@@ -2410,9 +2466,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// forcekeyframeevery intervals then break out of the loop.
if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
break;
- } else
+ } else {
cpi->twopass.frames_to_key++;
-
+ }
i++;
}
@@ -2452,22 +2508,24 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
reset_fpf_position(cpi, current_pos);
cpi->next_key_frame_forced = 1;
- } else
+ } else {
cpi->next_key_frame_forced = 0;
-
+ }
// Special case for the last frame of the file
if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ // These figures keep intra and coded error counts for all frames including
+ // key frames in the group. The effect of the key frame itself can be
+ // subtracted out using the first_frame data collected above.
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
}
// Calculate the number of bits that should be assigned to the kf group.
- if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+ if ((cpi->twopass.bits_left > 0) &&
+ (cpi->twopass.modified_error_left > 0.0)) {
// Max for a single normal frame (not key frame)
int max_bits = frame_max_bits(cpi);
@@ -2484,13 +2542,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
if (cpi->twopass.kf_group_bits > max_grp_bits)
cpi->twopass.kf_group_bits = max_grp_bits;
- } else
+ } else {
cpi->twopass.kf_group_bits = 0;
-
+ }
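Read together, the allocation logic in this hunk amounts to: give the key-frame group a share of the remaining bits proportional to its share of the remaining modified error, capped at the per-frame maximum times the group length, and zero if either pool is exhausted. A hedged sketch; the proportional-share line is inferred from the surrounding context, since the diff elides it:

    static int64_t kf_group_bit_allocation(int64_t bits_left,
                                           double kf_group_err,
                                           double modified_error_left,
                                           int max_bits, int frames_to_key) {
      int64_t bits, max_grp_bits;
      if (bits_left <= 0 || modified_error_left <= 0.0)
        return 0;
      /* Proportional share of the remaining bit pool (inferred step). */
      bits = (int64_t)(bits_left * (kf_group_err / modified_error_left));
      /* Cap: no more than the per-frame maximum times the group length. */
      max_grp_bits = (int64_t)max_bits * (int64_t)frames_to_key;
      return bits > max_grp_bits ? max_grp_bits : bits;
    }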
// Reset the first pass file position
reset_fpf_position(cpi, start_position);
- // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+ // Determine how big to make this keyframe based on how well the subsequent
+ // frames use inter blocks.
decay_accumulator = 1.0;
boost_score = 0.0;
loop_decay_rate = 1.00; // Starting decay rate
@@ -2563,7 +2622,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (kf_boost < (cpi->twopass.frames_to_key * 3))
kf_boost = (cpi->twopass.frames_to_key * 3);
- if (kf_boost < 300) // Min KF boost
+ if (kf_boost < 300) // Min KF boost
kf_boost = 300;
// Make a note of baseline boost and the zero motion
@@ -2598,10 +2657,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
allocation_chunks /= divisor;
}
- cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+ cpi->twopass.kf_group_bits =
+ (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
// Calculate the number of bits to be spent on the key frame
- cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+ cpi->twopass.kf_bits =
+ (int)((double)kf_boost *
+ ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
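As a worked example of the formula just rewrapped (numbers assumed): with kf_boost = 300, allocation_chunks = 2400, and kf_group_bits = 1,200,000, the key frame receives 300 * (1200000 / 2400) = 150,000 bits; each boost unit converts directly into one chunk of the group's bit pool.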
// If the key frame is actually easier than the average for the
// kf group (which does sometimes happen... eg a blank intro frame)
@@ -2619,11 +2681,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (cpi->twopass.kf_bits > alt_kf_bits) {
cpi->twopass.kf_bits = alt_kf_bits;
}
- }
+ } else {
// Else if it is much harder than other frames in the group make sure
// it at least receives an allocation in keeping with its relative
// error score
- else {
alt_kf_bits =
(int)((double)cpi->twopass.bits_left *
(kf_mod_err /
@@ -2649,6 +2710,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
// Adjust the count of total modified error left.
- // The count of bits left is adjusted elsewhere based on real coded frame sizes
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
cpi->twopass.modified_error_left -= kf_group_err;
}
diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h
index 2296a66..c18d11e 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/libvpx/vp9/encoder/vp9_firstpass.h
@@ -10,6 +10,7 @@
#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
#define VP9_ENCODER_VP9_FIRSTPASS_H_
+#include "vp9/encoder/vp9_onyx_int.h"
void vp9_init_first_pass(VP9_COMP *cpi);
void vp9_first_pass(VP9_COMP *cpi);
diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c
index 81445a9..c28c868 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/libvpx/vp9/encoder/vp9_lookahead.c
@@ -10,7 +10,7 @@
#include <assert.h>
#include <stdlib.h>
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/common/vp9_extend.h"
@@ -77,7 +77,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
goto bail;
}
return ctx;
-bail:
+ bail:
vp9_lookahead_destroy(ctx);
return NULL;
}
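A side note on the include change above: the "./vpx_config.h" spelling anchors the include to the generated configuration header in the build output directory rather than whatever vpx_config.h happens to be first on the include path. That reading is inferred from libvpx convention rather than stated in the patch:

    #include "./vpx_config.h"           /* generated at configure time */
    #include "vp9/common/vp9_common.h"  /* ordinary source-tree include */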
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 5a671f2..7b605b2 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -10,14 +10,17 @@
#include <limits.h>
-#include <vpx_mem/vpx_mem.h>
-#include <vp9/encoder/vp9_encodeintra.h>
-#include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_blockd.h>
-#include <vp9/common/vp9_reconinter.h>
-#include <vp9/common/vp9_reconintra.h>
-#include <vp9/common/vp9_systemdependent.h>
-#include <vp9/encoder/vp9_segmentation.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/encoder/vp9_encodeintra.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+
static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
int_mv *ref_mv,
@@ -46,9 +49,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
/*cpi->sf.search_method == HEX*/
- best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
+ best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit,
0, &v_fn_ptr,
- 0, ref_mv, dst_mv);
+ 0, &ref_mv->as_mv, &dst_mv->as_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -57,7 +60,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
unsigned int sse;
best_err = cpi->find_fractional_mv_step(
x,
- dst_mv, ref_mv,
+ &dst_mv->as_mv, &ref_mv->as_mv,
+ cpi->common.allow_high_precision_mv,
x->errorperbit, &v_fn_ptr,
0, cpi->sf.subpel_iters_per_step, NULL, NULL,
& distortion, &sse);
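Since the diff fragments the call, here it is assembled in one place; note that high-precision-MV permission is now passed explicitly (cpi->common.allow_high_precision_mv) rather than read from MACROBLOCKD inside the search. This restates the hunk, it is not new behavior:

    best_err = cpi->find_fractional_mv_step(
        x,
        &dst_mv->as_mv, &ref_mv->as_mv,
        cpi->common.allow_high_precision_mv,
        x->errorperbit, &v_fn_ptr,
        0, cpi->sf.subpel_iters_per_step,
        NULL, NULL,
        &distortion, &sse);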
@@ -100,7 +104,8 @@ static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv,
dst_mv->as_int = tmp_mv.as_int;
}
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
if (ref_mv->as_int) {
unsigned int tmp_err;
int_mv zero_ref_mv, tmp_mv;
@@ -145,7 +150,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
unsigned int err;
- xd->this_mi->mbmi.mode = mode;
+ xd->mi_8x8[0]->mbmi.mode = mode;
vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
@@ -189,8 +194,8 @@ static void update_mbgraph_mb_stats
x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
x->plane[0].src.stride = buf->y_stride;
- xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset;
- xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride;
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
// do intra 16x16 prediction
intra_error = find_best_16x16_intra(cpi, mb_y_offset,
@@ -214,7 +219,8 @@ static void update_mbgraph_mb_stats
stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
}
- // Alt-ref frame MV search, if it exists and is different than last/golden frame
+  // Do an Alt-ref frame MV search, if it exists and differs from the
+  // last/golden frame.
if (alt_ref) {
int a_motion_error;
xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
@@ -243,17 +249,17 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
int_mv arf_top_mv, gld_top_mv;
MODE_INFO mi_local = { { 0 } };
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
arf_top_mv.as_int = 0;
gld_top_mv.as_int = 0;
- x->mv_row_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
- x->mv_row_max = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS
- - 8 - VP9_INTERP_EXTEND;
+ x->mv_row_min = -BORDER_MV_PIXELS_B16;
+ x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
xd->up_available = 0;
xd->plane[0].dst.stride = buf->y_stride;
xd->plane[0].pre[0].stride = buf->y_stride;
xd->plane[1].dst.stride = buf->uv_stride;
- xd->this_mi = &mi_local;
+ xd->mi_8x8[0] = &mi_local;
mi_local.mbmi.sb_type = BLOCK_16X16;
mi_local.mbmi.ref_frame[0] = LAST_FRAME;
mi_local.mbmi.ref_frame[1] = NONE;
@@ -264,12 +270,12 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
int arf_y_in_offset = arf_y_offset;
int gld_y_in_offset = gld_y_offset;
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
arf_left_mv.as_int = arf_top_mv.as_int;
gld_left_mv.as_int = gld_top_mv.as_int;
- x->mv_col_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
- x->mv_col_max = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS
- - 8 - VP9_INTERP_EXTEND;
+ x->mv_col_min = -BORDER_MV_PIXELS_B16;
+ x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
xd->left_available = 0;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
@@ -307,6 +313,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
static void separate_arf_mbs(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
int ncnt[4] = { 0 };
int n_frames = cpi->mbgraph_n_frames;
@@ -343,22 +350,17 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
}
}
- for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
- offset += cm->mb_cols, mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+ // of bound access in segmentation_map
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
// If any of the blocks in the sequence failed then the MB
// goes in segment 0
- if (arf_not_zz[offset + mb_col]) {
+ if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) {
ncnt[0]++;
- cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 0;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
} else {
- cpi->segmentation_map[offset * 4 + 2 * mb_col] = 1;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
ncnt[1]++;
}
}
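The index arithmetic in this hunk is worth spelling out: segmentation_map is stored per 8x8 MI unit while arf_not_zz is kept per 16x16 MB, so each MI coordinate maps to its enclosing MB by halving. A compact restatement of the loop body under that reading:

    /* One MB (16x16) covers a 2x2 block of MIs (8x8), hence the halving. */
    const int mb_index = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
    cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] =
        arf_not_zz[mb_index] ? 0 : 1;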
@@ -369,7 +371,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
if (1) {
// Note % of blocks that are marked as static
if (cm->MBs)
- cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
// This error case should not be reachable as this function should
// never be called with the common data structure uninitialized.
@@ -406,7 +408,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
for (i = 0; i < n_frames; i++) {
MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
vpx_memset(frame_stats->mb_stats, 0,
- cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ cm->mb_rows * cm->mb_cols *
+ sizeof(*cpi->mbgraph_stats[i].mb_stats));
}
// do motion search to find contribution of each reference to data
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index 1360088..a52f5b1 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -59,38 +59,39 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) {
return sr;
}
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
- int weight) {
- MV v;
- v.row = mv->as_mv.row - ref->as_mv.row;
- v.col = mv->as_mv.col - ref->as_mv.col;
- return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
- mvcost[0][v.row] +
- mvcost[1][v.col]) * weight, 7);
+static INLINE int mv_cost(const MV *mv,
+ const int *joint_cost, int *comp_cost[2]) {
+ return joint_cost[vp9_get_mv_joint(mv)] +
+ comp_cost[0][mv->row] + comp_cost[1][mv->col];
}
-static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
+int vp9_mv_bit_cost(const MV *mv, const MV *ref,
+ const int *mvjcost, int *mvcost[2], int weight) {
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
+}
+
+static int mv_err_cost(const MV *mv, const MV *ref,
+ const int *mvjcost, int *mvcost[2],
int error_per_bit) {
if (mvcost) {
- MV v;
- v.row = mv->as_mv.row - ref->as_mv.row;
- v.col = mv->as_mv.col - ref->as_mv.col;
- return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
- mvcost[0][v.row] +
- mvcost[1][v.col]) * error_per_bit, 13);
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) *
+ error_per_bit, 13);
}
return 0;
}
-static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost,
- int *mvsadcost[2], int error_per_bit) {
+static int mvsad_err_cost(const MV *mv, const MV *ref,
+ const int *mvjsadcost, int *mvsadcost[2],
+ int error_per_bit) {
if (mvsadcost) {
- MV v;
- v.row = mv->as_mv.row - ref->as_mv.row;
- v.col = mv->as_mv.col - ref->as_mv.col;
- return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] +
- mvsadcost[0][v.row] +
- mvsadcost[1][v.col]) * error_per_bit, 8);
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) *
+ error_per_bit, 8);
}
return 0;
}
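The refactor above routes all three cost paths through one primitive: the joint cost of the difference MV plus its per-component costs, followed by a scale-and-round. The equivalent call shape, assuming the cost tables from the surrounding code:

    const MV diff = { mv->row - ref->row, mv->col - ref->col };
    /* weight-scaled bit cost, rounded by 2^7 as in vp9_mv_bit_cost() */
    const int bits =
        ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);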
@@ -136,66 +137,26 @@ void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
}
void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
- int len;
- int search_site_count = 0;
+ int len, ss_count = 1;
- // Generate offsets for 8 search sites per step.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = 0;
- search_site_count++;
+ x->ss[0].mv.col = x->ss[0].mv.row = 0;
+ x->ss[0].offset = 0;
for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = -len;
- x->ss[search_site_count].offset = -len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = len;
- x->ss[search_site_count].offset = len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = -len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -len;
- x->ss[search_site_count].mv.row = -len;
- x->ss[search_site_count].offset = -len * stride - len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = len;
- x->ss[search_site_count].mv.row = -len;
- x->ss[search_site_count].offset = -len * stride + len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -len;
- x->ss[search_site_count].mv.row = len;
- x->ss[search_site_count].offset = len * stride - len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = len;
- x->ss[search_site_count].mv.row = len;
- x->ss[search_site_count].offset = len * stride + len;
- search_site_count++;
+ // Generate offsets for 8 search sites per step.
+ const MV ss_mvs[8] = {
+ {-len, 0 }, {len, 0 }, { 0, -len}, {0, len},
+ {-len, -len}, {-len, len}, {len, -len}, {len, len}
+ };
+ int i;
+ for (i = 0; i < 8; ++i) {
+ search_site *const ss = &x->ss[ss_count++];
+ ss->mv = ss_mvs[i];
+ ss->offset = ss->mv.row * stride + ss->mv.col;
+ }
}
- x->ss_count = search_site_count;
+ x->ss_count = ss_count;
x->searches_per_step = 8;
}
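To see what the table-driven rewrite generates, take one ring with len = 2 and stride = 64; offset = row * stride + col reproduces exactly the eight offsets the unrolled version spelled out:

    /*   mv (row, col)   offset = row * 64 + col
     *   (-2,  0)        -128
     *   ( 2,  0)         128
     *   ( 0, -2)          -2
     *   ( 0,  2)           2
     *   (-2, -2)        -130
     *   (-2,  2)        -126
     *   ( 2, -2)         126
     *   ( 2,  2)         130
     */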
@@ -313,7 +274,8 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
}
int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -333,38 +295,34 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
unsigned int eighthiters = iters_per_step;
int thismse;
- uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *y = xd->plane[0].pre[0].buf + offset;
- int rr = ref_mv->as_mv.row;
- int rc = ref_mv->as_mv.col;
- int br = bestmv->as_mv.row << 3;
- int bc = bestmv->as_mv.col << 3;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
int hstep = 4;
- const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX);
- const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX);
- const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX);
- const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX);
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
- const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
// calculate central point error
besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- // TODO: Each subsequent iteration checks at least one point in
- // common with the last iteration could be 2 ( if diag selected)
+ // TODO(jbb): Each subsequent iteration checks at least one point in
+ // common with the last iteration could be 2 if diagonal is selected.
while (halfiters--) {
// 1/2 pel
FIRST_LEVEL_CHECKS;
@@ -375,8 +333,8 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
tc = bc;
}
- // TODO: Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
+ // TODO(yaowu): Each subsequent iteration checks at least one point in common
+ // with the last iteration could be 2 if diagonal is selected.
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
@@ -391,8 +349,7 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
}
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
@@ -404,18 +361,19 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
}
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
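A runnable aside on the unit convention these sub-pixel routines rely on: motion vectors are held in 1/8-pel units, so the old "<< 3" and the new "* 8" are the same full-pel-to-eighth-pel scaling, and hstep = 4 is a half-pel probe distance.

    #include <stdio.h>

    int main(void) {
      const int full_pel_row = 3;
      const int eighth_pel_row = full_pel_row * 8;  /* was: row << 3 */
      const int hstep = 4;                          /* 4/8 pel == 1/2 pel */
      printf("%d full pel = %d eighth pel; first probes at +/-%d (1/2 pel)\n",
             full_pel_row, eighth_pel_row, hstep);
      return 0;
    }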
int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -424,49 +382,36 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
int *distortion,
unsigned int *sse1) {
uint8_t *z = x->plane[0].src.buf;
- int src_stride = x->plane[0].src.stride;
+ const int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
- int rr, rc, br, bc, hstep;
- int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
- int maxc, minc, maxr, minr;
- int y_stride;
- int offset;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
- uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
- y_stride = xd->plane[0].pre[0].stride;
-
- rr = ref_mv->as_mv.row;
- rc = ref_mv->as_mv.col;
- br = bestmv->as_mv.row << 3;
- bc = bestmv->as_mv.col << 3;
- hstep = 4;
- minc = MAX(x->mv_col_min << 3,
- (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3,
- (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3,
- (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3,
- (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *y = xd->plane[0].pre[0].buf + offset;
- tr = br;
- tc = bc;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
- offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+ int tr = br;
+ int tc = bc;
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row *= 8;
+ bestmv->col *= 8;
// calculate central point error
besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
@@ -492,8 +437,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -503,11 +447,11 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
tc = bc;
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
@@ -520,7 +464,8 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
z, src_stride, &sse, second_pred)
int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -543,30 +488,26 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
int thismse;
DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
- uint8_t *const y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *const y = xd->plane[0].pre[0].buf + offset;
- int rr = ref_mv->as_mv.row;
- int rc = ref_mv->as_mv.col;
- int br = bestmv->as_mv.row << 3;
- int bc = bestmv->as_mv.col << 3;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
int hstep = 4;
- const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX);
- const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX);
- const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX);
- const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX);
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
- const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row *= 8;
+ bestmv->col *= 8;
// calculate central point error
// TODO(yunqingwang): central pointer error was already calculated in full-
@@ -604,8 +545,7 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
}
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
@@ -616,18 +556,19 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
tc = bc;
}
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -638,51 +579,37 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
const uint8_t *second_pred,
int w, int h) {
uint8_t *z = x->plane[0].src.buf;
- int src_stride = x->plane[0].src.stride;
+ const int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
- int rr, rc, br, bc, hstep;
- int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
- int maxc, minc, maxr, minr;
- int y_stride;
- int offset;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
- uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
- y_stride = xd->plane[0].pre[0].stride;
-
- rr = ref_mv->as_mv.row;
- rc = ref_mv->as_mv.col;
- br = bestmv->as_mv.row << 3;
- bc = bestmv->as_mv.col << 3;
- hstep = 4;
- minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
- ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
- ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
- ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
- ((1 << MV_MAX_BITS) - 1));
-
- tr = br;
- tc = bc;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *y = xd->plane[0].pre[0].buf + offset;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
- offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+ int tr = br;
+ int tc = bc;
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row *= 8;
+ bestmv->col *= 8;
// calculate central point error
// TODO(yunqingwang): central pointer error was already calculated in full-
@@ -716,8 +643,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -726,11 +652,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
tr = br;
tc = bc;
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
@@ -754,10 +680,10 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
#define CHECK_POINT \
{\
- if (this_mv.as_mv.col < x->mv_col_min) continue;\
- if (this_mv.as_mv.col > x->mv_col_max) continue;\
- if (this_mv.as_mv.row < x->mv_row_min) continue;\
- if (this_mv.as_mv.row > x->mv_row_max) continue;\
+ if (this_mv.col < x->mv_col_min) continue;\
+ if (this_mv.col > x->mv_col_max) continue;\
+ if (this_mv.row < x->mv_row_min) continue;\
+ if (this_mv.row > x->mv_row_max) continue;\
}
#define CHECK_BETTER \
@@ -765,7 +691,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
if (thissad < bestsad)\
{\
if (use_mvcost) \
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \
mvjsadcost, mvsadcost, \
sad_per_bit);\
if (thissad < bestsad)\
@@ -790,14 +716,14 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
// candidates as indicated in the num_candidates and candidates arrays
// passed into this function
static int vp9_pattern_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
int do_refine,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv, int_mv *best_mv,
+ const MV *center_mv, MV *best_mv,
const int num_candidates[MAX_PATTERN_SCALES],
const MV candidates[MAX_PATTERN_SCALES]
[MAX_PATTERN_CANDIDATES]) {
@@ -810,7 +736,7 @@ static int vp9_pattern_search(MACROBLOCK *x,
int what_stride = x->plane[0].src.stride;
int in_what_stride = xd->plane[0].pre[0].stride;
int br, bc;
- int_mv this_mv;
+ MV this_mv;
int bestsad = INT_MAX;
int thissad;
uint8_t *base_offset;
@@ -823,24 +749,22 @@ static int vp9_pattern_search(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.as_mv.row = center_mv->row >> 3;
+ fcenter_mv.as_mv.col = center_mv->col >> 3;
// adjust ref_mv to make sure it is within MV range
- clamp_mv(&ref_mv->as_mv,
- x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- br = ref_mv->as_mv.row;
- bc = ref_mv->as_mv.col;
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->row;
+ bc = ref_mv->col;
// Work out the start point for the search
base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
this_offset = base_offset + (br * in_what_stride) + bc;
- this_mv.as_mv.row = br;
- this_mv.as_mv.col = bc;
- bestsad = vfp->sdf(what, what_stride, this_offset,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ this_mv.row = br;
+ this_mv.col = bc;
+ bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Search all possible scales up to the search param around the center point
  // and pick the scale of the point that is best as the starting scale of
@@ -853,21 +777,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
CHECK_BOUNDS((1 << t))
if (all_in) {
for (i = 0; i < num_candidates[t]; i++) {
- this_mv.as_mv.row = br + candidates[t][i].row;
- this_mv.as_mv.col = bc + candidates[t][i].col;
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_mv.row = br + candidates[t][i].row;
+ this_mv.col = bc + candidates[t][i].col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < num_candidates[t]; i++) {
- this_mv.as_mv.row = br + candidates[t][i].row;
- this_mv.as_mv.col = bc + candidates[t][i].col;
+ this_mv.row = br + candidates[t][i].row;
+ this_mv.col = bc + candidates[t][i].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -897,21 +821,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
CHECK_BOUNDS((1 << s))
if (all_in) {
for (i = 0; i < num_candidates[s]; i++) {
- this_mv.as_mv.row = br + candidates[s][i].row;
- this_mv.as_mv.col = bc + candidates[s][i].col;
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_mv.row = br + candidates[s][i].row;
+ this_mv.col = bc + candidates[s][i].col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < num_candidates[s]; i++) {
- this_mv.as_mv.row = br + candidates[s][i].row;
- this_mv.as_mv.col = bc + candidates[s][i].col;
+ this_mv.row = br + candidates[s][i].row;
+ this_mv.col = bc + candidates[s][i].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -935,25 +859,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
get_next_chkpts(next_chkpts_indices, k, num_candidates[s]);
if (all_in) {
for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- this_mv.as_mv.row = br +
- candidates[s][next_chkpts_indices[i]].row;
- this_mv.as_mv.col = bc +
- candidates[s][next_chkpts_indices[i]].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
+ this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- this_mv.as_mv.row = br +
- candidates[s][next_chkpts_indices[i]].row;
- this_mv.as_mv.col = bc +
- candidates[s][next_chkpts_indices[i]].col;
+ this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
+ this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -980,21 +900,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
CHECK_BOUNDS(1)
if (all_in) {
for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_mv.row = br + neighbors[i].row;
+ this_mv.col = bc + neighbors[i].col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
+ this_mv.row = br + neighbors[i].row;
+ this_mv.col = bc + neighbors[i].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -1010,31 +930,32 @@ static int vp9_pattern_search(MACROBLOCK *x,
}
}
- best_mv->as_mv.row = br;
- best_mv->as_mv.col = bc;
+ best_mv->row = br;
+ best_mv->col = bc;
- this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) +
- best_mv->as_mv.col;
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_offset = base_offset + (best_mv->row * in_what_stride) +
+ best_mv->col;
+ this_mv.row = best_mv->row * 8;
+ this_mv.col = best_mv->col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
- return
- vfp->vf(what, what_stride, this_offset, in_what_stride,
- (unsigned int *)(&bestsad)) +
- use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost,
- x->errorperbit) : 0;
+
+ return vfp->vf(what, what_stride, this_offset, in_what_stride,
+ (unsigned int *)&bestsad) +
+ use_mvcost ? mv_err_cost(&this_mv, center_mv,
+ x->nmvjointcost, x->mvcost, x->errorperbit)
+ : 0;
}
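One caution about the return expression just above: in C, '+' binds tighter than '?:', so it evaluates as (vf(...) + use_mvcost) ? mv_err_cost(...) : 0, making the variance the condition rather than an addend. If the intent is to add the MV cost only when use_mvcost is set, the conditional needs parentheses; a corrected sketch using the same names:

    return vfp->vf(what, what_stride, this_offset, in_what_stride,
                   (unsigned int *)&bestsad) +
           (use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
                                     x->mvcost, x->errorperbit)
                       : 0);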
int vp9_hex_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv, int_mv *best_mv) {
+ const MV *center_mv, MV *best_mv) {
// First scale has 8-closest points, the rest have 6 points in hex shape
// at increasing scales
static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1063,14 +984,14 @@ int vp9_hex_search(MACROBLOCK *x,
}
int vp9_bigdia_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv) {
+ const MV *center_mv,
+ MV *best_mv) {
// First scale has 4-closest points, the rest have 8 points in diamond
// shape at increasing scales
static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1097,22 +1018,21 @@ int vp9_bigdia_search(MACROBLOCK *x,
{{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
{-512, 512}, {-1024, 0}},
};
- return
- vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
- center_mv, best_mv,
- bigdia_num_candidates, bigdia_candidates);
+ return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ bigdia_num_candidates, bigdia_candidates);
}
int vp9_square_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv) {
+ const MV *center_mv,
+ MV *best_mv) {
// All scales have 8 closest points in square shape
static const int square_num_candidates[MAX_PATTERN_SCALES] = {
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
@@ -1139,11 +1059,10 @@ int vp9_square_search(MACROBLOCK *x,
{{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
{0, 1024}, {-1024, 1024}, {-1024, 0}},
};
- return
- vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
- center_mv, best_mv,
- square_num_candidates, square_candidates);
+ return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ square_num_candidates, square_candidates);
};
#undef CHECK_BOUNDS
@@ -1199,13 +1118,14 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
best_address = in_what;
// Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ // search_param determines the length of the initial step and hence the number
+  // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+ // (MAX_FIRST_STEP/4) pel... etc.
ss = &x->ss[search_param * x->searches_per_step];
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
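A small runnable illustration of how search_param shrinks the initial diamond step (the MAX_MVSEARCH_STEPS value is assumed; the macro shapes match the vp9_mcomp.h hunk at the end of this diff):

    #include <stdio.h>

    #define MAX_MVSEARCH_STEPS 11                       /* assumed value */
    #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))

    int main(void) {
      int search_param;
      for (search_param = 0; search_param <= 3; ++search_param)
        printf("search_param %d -> first step %d pel\n",
               search_param, MAX_FIRST_STEP >> search_param);
      return 0;
    }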
@@ -1228,7 +1148,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1260,7 +1180,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1274,19 +1194,21 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
break;
};
#endif
- } else if (best_address == in_what)
+ } else if (best_address == in_what) {
(*num00)++;
+ }
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, center_mv, mvjcost,
- mvcost, x->errorperbit);
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
}
int vp9_diamond_search_sadx4(MACROBLOCK *x,
@@ -1340,13 +1262,15 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
best_address = in_what;
// Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride,
- in_what, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
ss = &x->ss[search_param * x->searches_per_step];
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
@@ -1355,13 +1279,16 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
for (step = 0; step < tot_steps; step++) {
int all_in = 1, t;
- // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of
- // checking 4 bounds for each points.
+    // All_in is true if every one of the points we are checking is within
+ // the bounds of the image.
all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+    // for validity.
if (all_in) {
unsigned int sad_array[4];
@@ -1378,7 +1305,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
if (sad_array[t] < bestsad) {
this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
- sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+ sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (sad_array[t] < bestsad) {
@@ -1394,15 +1321,18 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1433,7 +1363,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1447,19 +1377,21 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
break;
};
#endif
- } else if (best_address == in_what)
+ } else if (best_address == in_what) {
(*num00)++;
+ }
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) + mv_err_cost(&this_mv,
- center_mv, mvjcost, mvcost, x->errorperbit);
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
}
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
@@ -1482,16 +1414,17 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
n = num00;
num00 = 0;
- /* If there won't be more n-step search, check to see if refining search is needed. */
+ /* If there won't be more n-step search, check to see if refining search is
+ * needed. */
if (n > further_steps)
do_refine = 0;
while (n < further_steps) {
n++;
- if (num00)
+ if (num00) {
num00--;
- else {
+ } else {
thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
step_param + n, sadpb, &num00,
fn_ptr, x->nmvjointcost, x->mvcost,
@@ -1570,8 +1503,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Apply further limits to prevent us from searching using vectors that
  // stretch beyond the UMV border.
@@ -1585,11 +1518,12 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
check_here = r * mv_stride + in_what + col_min;
for (c = col_min; c < col_max; c++) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1602,14 +1536,14 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -1660,8 +1594,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Apply further limits to prevent us from searching using vectors that
  // stretch beyond the UMV border.
@@ -1685,8 +1619,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1702,11 +1636,12 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
}
while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1720,17 +1655,16 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
check_here++;
c++;
}
-
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -1783,8 +1717,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Apply further limits to prevent us from searching using vectors that
  // stretch beyond the UMV border.
@@ -1808,8 +1742,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1834,7 +1768,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1851,12 +1785,13 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
}
while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1871,14 +1806,14 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -1909,8 +1844,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, 0x7fffffff) +
+ mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
@@ -1919,16 +1856,20 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1938,23 +1879,24 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
}
}
- if (best_site == -1)
+ if (best_site == -1) {
break;
- else {
+ } else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+ this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+ this_mv.as_mv.col = ref_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
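The structure of the refining search, a greedy 4-neighbor descent that recenters until no neighbor improves, can be exercised standalone on a toy cost surface (the real code scores candidates with fn_ptr->sdf plus the MV SAD cost):

    #include <stdio.h>

    typedef struct { int row, col; } MV;

    /* Toy convex cost standing in for SAD; minimum is at (2, -1). */
    static int cost(MV m) {
      return (m.row - 2) * (m.row - 2) + (m.col + 1) * (m.col + 1);
    }

    int main(void) {
      static const MV nb[4] = { {-1, 0}, {0, -1}, {0, 1}, {1, 0} };
      MV best = { 0, 0 };
      int i, search_range = 8;
      for (i = 0; i < search_range; i++) {
        int j, best_site = -1, bestc = cost(best);
        for (j = 0; j < 4; j++) {
          MV t = { best.row + nb[j].row, best.col + nb[j].col };
          if (cost(t) < bestc) { bestc = cost(t); best_site = j; }
        }
        if (best_site == -1) break;       /* no neighbor improves: done */
        best.row += nb[best_site].row;    /* recenter on the best site */
        best.col += nb[best_site].col;
      }
      printf("converged at (%d, %d)\n", best.row, best.col);  /* (2, -1) */
      return 0;
    }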
@@ -1986,8 +1928,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, 0x7fffffff) +
+ mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
@@ -2004,14 +1948,15 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
block_offset[2] = best_address + 1;
block_offset[3] = best_address + in_what_stride;
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
for (j = 0; j < 4; j++) {
if (sad_array[j] < bestsad) {
this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
- sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
+ sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (sad_array[j] < bestsad) {
bestsad = sad_array[j];
@@ -2024,16 +1969,20 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -2044,23 +1993,24 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
}
}
- if (best_site == -1)
+ if (best_site == -1) {
break;
- else {
+ } else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+ this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+ this_mv.as_mv.col = ref_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -2100,7 +2050,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
/* Get compound pred by averaging two pred blocks. */
bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
second_pred, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+ mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
@@ -2123,9 +2074,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
-
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
best_site = j;
@@ -2144,16 +2094,16 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+ this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+ this_mv.as_mv.col = ref_mv->as_mv.col * 8;
if (bestsad < INT_MAX) {
// FIXME(rbultje, yunqing): add full-pixel averaging variance functions
// so we don't have to use the subpixel with xoff=0,yoff=0 here.
- return fn_ptr->svaf(best_address, in_what_stride, 0, 0,
- what, what_stride, (unsigned int *)(&thissad),
- second_pred) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride,
+ (unsigned int *)(&thissad), second_pred) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
} else {
return INT_MAX;
}
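
Two changes repeat throughout the vp9_mcomp.c hunks above: the cost helpers now take plain MV pointers instead of the int_mv wrapper union, and the full-pel to 1/8-pel conversion is written as a multiply rather than a left shift (a signed left shift of a negative value is undefined in C, while * 8 is not). The sketch below shows the rough shape of the two types and the unit conversion; it is illustrative only, and the real definitions live in vp9/common/vp9_mv.h.

#include <stdint.h>
#include <stdio.h>

/* Rough shapes of the two motion-vector types involved; the real
 * definitions live in vp9/common/vp9_mv.h. */
typedef struct mv {
  int16_t row;  /* stored in 1/8-pel units at the codec interface */
  int16_t col;
} MV;

typedef union int_mv {
  uint32_t as_int;  /* whole-vector compare/assign in one operation */
  MV as_mv;         /* per-component access */
} int_mv;

int main(void) {
  int_mv full_pel;
  MV subpel;
  full_pel.as_mv.row = 3;   /* full-pel refining-search result */
  full_pel.as_mv.col = -2;
  /* The search runs in full-pel units; converting back to 1/8-pel is a
   * multiply by 8.  The patch swaps "<< 3" for "* 8", which is
   * well-defined for negative values where a signed left shift is not. */
  subpel.row = full_pel.as_mv.row * 8;
  subpel.col = full_pel.as_mv.col * 8;
  printf("(%d, %d) full-pel -> (%d, %d) 1/8-pel\n",
         full_pel.as_mv.row, full_pel.as_mv.col, subpel.row, subpel.col);
  return 0;
}
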
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 3598fa0..bcab679 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -22,10 +22,14 @@
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
// Maximum size of the first step in full pel units
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
+// Allowed motion vector pixel distance outside image border
+// for Block_16x16
+#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
+
void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv);
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
- int *mvcost[2], int weight);
+int vp9_mv_bit_cost(const MV *mv, const MV *ref,
+ const int *mvjcost, int *mvcost[2], int weight);
void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
void vp9_init3smotion_compensation(MACROBLOCK *x, int stride);
@@ -40,37 +44,37 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
int_mv *ref_mv, int_mv *dst_mv);
int vp9_hex_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int error_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv);
+ const MV *center_mv,
+ MV *best_mv);
int vp9_bigdia_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int error_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv);
+ const MV *center_mv,
+ MV *best_mv);
int vp9_square_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int error_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv);
+ const MV *center_mv,
+ MV *best_mv);
typedef int (fractional_mv_step_fp) (
MACROBLOCK *x,
- int_mv *bestmv,
- int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
@@ -84,7 +88,8 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
typedef int (fractional_mv_step_comp_fp) (
MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
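
The prototypes above also grow an explicit allow_hp parameter, so the sub-pel step functions no longer need to read the high-precision-MV setting out of macroblockd state. A hypothetical sketch (helper name and logic invented here, not taken from vp9_mcomp.c) of the kind of decision that flag gates, using the header's own forced_stop encoding:

#include <stdio.h>

/* Hypothetical: how allow_hp could bound sub-pel refinement depth.
 * forced_stop uses the header's encoding: 0 - full, 1 - qtr only,
 * 2 - half only. */
static int subpel_levels(int allow_hp, int forced_stop) {
  int levels = 3 - forced_stop;  /* half, quarter, eighth */
  if (!allow_hp && levels == 3)
    levels = 2;                  /* high precision off: stop at 1/4-pel */
  return levels;
}

int main(void) {
  printf("hp on,  full search: %d levels\n", subpel_levels(1, 0));  /* 3 */
  printf("hp off, full search: %d levels\n", subpel_levels(0, 0));  /* 2 */
  return 0;
}
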
diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c
index a5dfaed..7eb6592 100644
--- a/libvpx/vp9/encoder/vp9_modecosts.c
+++ b/libvpx/vp9/encoder/vp9_modecosts.c
@@ -17,7 +17,7 @@
void vp9_init_mode_costs(VP9_COMP *c) {
VP9_COMMON *const cm = &c->common;
- const vp9_tree_p KT = vp9_intra_mode_tree;
+ const vp9_tree_index *KT = vp9_intra_mode_tree;
int i, j;
for (i = 0; i < INTRA_MODES; i++) {
@@ -36,7 +36,7 @@ void vp9_init_mode_costs(VP9_COMP *c) {
vp9_kf_uv_mode_prob[INTRA_MODES - 1],
vp9_intra_mode_tree);
- for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
cm->fc.switchable_interp_prob[i],
vp9_switchable_interp_tree);
diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c
index 883b31e..f922f90 100644
--- a/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -8,44 +8,35 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
-#include "vpx_config.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/common/vp9_tile_common.h"
#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_picklpf.h"
#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
#include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "./vp9_rtcd.h"
-#include "./vpx_scale_rtcd.h"
-#if CONFIG_VP9_POSTPROC
-#include "vp9/common/vp9_postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/common/vp9_pred_common.h"
#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_vaq.h"
+
+#include "vpx_ports/vpx_timer.h"
-#include <math.h>
-#include <stdio.h>
-#include <limits.h>
extern void print_tree_update_probs();
@@ -55,16 +46,20 @@ static void set_default_lf_deltas(struct loopfilter *lf);
#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
-#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv
- for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200 /* Q threshold for use of high precision
- mv. Choose a very high value for
- now so that HIGH_PRECISION is always
- chosen */
+#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
+ // for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
+ // mv. Choose a very high value for
+ // now so that HIGH_PRECISION is always
+ // chosen.
-#if CONFIG_INTERNAL_STATS
-#include "math.h"
+// Masks for partially or completely disabling split mode
+#define DISABLE_ALL_SPLIT 0x3F
+#define DISABLE_ALL_INTER_SPLIT 0x1F
+#define DISABLE_COMPOUND_SPLIT 0x18
+#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
+#if CONFIG_INTERNAL_STATS
extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest, int lumamask,
double *weight);
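
The DISABLE_*_SPLIT values above are bitmasks over the sub-8x8 reference modes, one bit per entry of the MAX_REFS list introduced later in this patch. The ordering assumed below (THR_LAST, THR_GOLD, THR_ALTR, THR_COMP_LA, THR_COMP_GA, THR_INTRA) is an inference from the mask names, not quoted from the headers. A standalone check of that reading:

#include <stdio.h>

/* Assumed bit order for disable_split_mask; written out only to
 * illustrate the masks.  The real enum lives in the encoder headers. */
enum { THR_LAST, THR_GOLD, THR_ALTR, THR_COMP_LA, THR_COMP_GA, THR_INTRA,
       MAX_REFS };

#define LAST_AND_INTRA_SPLIT_ONLY 0x1E  /* bits 1-4 set: keep LAST and intra */

int main(void) {
  const char *name[MAX_REFS] =
      { "LAST", "GOLD", "ALTR", "COMP_LA", "COMP_GA", "INTRA" };
  int i;
  for (i = 0; i < MAX_REFS; i++)  /* same per-bit test the patch applies */
    printf("%-8s split %s\n", name[i],
           (LAST_AND_INTRA_SPLIT_ONLY & (1 << i)) ? "disabled" : "kept");
  return 0;
}
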
@@ -107,7 +102,8 @@ extern void write_switchable_interp_stats();
#endif
#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0};
#endif
#if defined(SECTIONBITS_OUTPUT)
@@ -122,6 +118,8 @@ static int kf_high_motion_minq[QINDEX_RANGE];
static int gf_low_motion_minq[QINDEX_RANGE];
static int gf_high_motion_minq[QINDEX_RANGE];
static int inter_minq[QINDEX_RANGE];
+static int afq_low_motion_minq[QINDEX_RANGE];
+static int afq_high_motion_minq[QINDEX_RANGE];
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
@@ -193,24 +191,55 @@ static void init_minq_luts(void) {
gf_low_motion_minq[i] = calculate_minq_index(maxq,
0.0000015,
-0.0009,
- 0.33,
+ 0.32,
0.0);
gf_high_motion_minq[i] = calculate_minq_index(maxq,
0.0000021,
-0.00125,
- 0.45,
+ 0.50,
0.0);
inter_minq[i] = calculate_minq_index(maxq,
0.00000271,
-0.00113,
- 0.697,
+ 0.75,
0.0);
+ afq_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000015,
+ -0.0009,
+ 0.33,
+ 0.0);
+ afq_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000021,
+ -0.00125,
+ 0.55,
+ 0.0);
+ }
+}
+static int get_active_quality(int q,
+ int gfu_boost,
+ int low,
+ int high,
+ int *low_motion_minq,
+ int *high_motion_minq) {
+ int active_best_quality;
+ if (gfu_boost > high) {
+ active_best_quality = low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ active_best_quality = high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ active_best_quality = low_motion_minq[q] + adjustment;
}
+ return active_best_quality;
}
-static void set_mvcost(MACROBLOCK *mb) {
- if (mb->e_mbd.allow_high_precision_mv) {
+static void set_mvcost(VP9_COMP *cpi) {
+ MACROBLOCK *const mb = &cpi->mb;
+ if (cpi->common.allow_high_precision_mv) {
mb->mvcost = mb->nmvcost_hp;
mb->mvsadcost = mb->nmvsadcost_hp;
} else {
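
get_active_quality() above consolidates several copies of the same interpolation that later hunks delete from the quality-selection code. Between the low and high boost thresholds it blends the two minq tables linearly, and adding (gap >> 1) before the divide rounds to nearest. A standalone copy with invented one-entry tables:

#include <stdio.h>

/* Standalone copy of the get_active_quality() interpolation, with toy
 * minq tables (values invented for the demo). */
static int get_active_quality(int q, int gfu_boost, int low, int high,
                              const int *low_motion_minq,
                              const int *high_motion_minq) {
  if (gfu_boost > high) {
    return low_motion_minq[q];
  } else if (gfu_boost < low) {
    return high_motion_minq[q];
  } else {
    const int gap = high - low;
    const int offset = high - gfu_boost;
    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
    return low_motion_minq[q] + adjustment;
  }
}

int main(void) {
  const int low_minq[1] = { 20 };
  const int high_minq[1] = { 60 };
  /* A boost halfway between low=400 and high=2000 lands halfway between
   * the two tables: prints 40. */
  printf("%d\n", get_active_quality(0, 1200, 400, 2000, low_minq, high_minq));
  return 0;
}
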
@@ -284,14 +313,17 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->mb_norm_activity_map);
cpi->mb_norm_activity_map = 0;
- vpx_free(cpi->mb.pip);
- cpi->mb.pip = 0;
+ vpx_free(cpi->above_context[0]);
+ cpi->above_context[0] = NULL;
+
+ vpx_free(cpi->above_seg_context);
+ cpi->above_seg_context = NULL;
}
// Computes a q delta (in "q index" terms) to get from a starting q value
// to a target value
// target q value
-static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
+int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
int i;
int start_index = cpi->worst_quality;
int target_index = cpi->worst_quality;
@@ -355,7 +387,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
seg->update_map = 1;
seg->update_data = 1;
- qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
+ qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
@@ -364,7 +396,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Where relevant assume segment data is delta data
seg->abs_delta = SEGMENT_DELTADATA;
-
}
} else if (seg->enabled) {
// All other frames if segmentation has been enabled
@@ -377,8 +408,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
seg->update_data = 1;
seg->abs_delta = SEGMENT_DELTADATA;
- qi_delta = compute_qdelta(cpi, cpi->avg_q,
- (cpi->avg_q * 1.125));
+ qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q,
+ (cpi->avg_q * 1.125));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
@@ -421,8 +452,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Skip all MBs if high Q (0,0 mv and skip coeffs)
if (high_q) {
- vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
- vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
}
// Enable data update
seg->update_data = 1;
@@ -565,16 +596,16 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_NEARESTG] = 0;
sf->thresh_mult[THR_NEARESTA] = 0;
- sf->thresh_mult[THR_NEWMV] += 1000;
- sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
- sf->thresh_mult[THR_NEARMV] += 1000;
- sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
-
sf->thresh_mult[THR_DC] += 1000;
- sf->thresh_mult[THR_NEWG] += 1000;
+ sf->thresh_mult[THR_NEWMV] += 1000;
sf->thresh_mult[THR_NEWA] += 1000;
+ sf->thresh_mult[THR_NEWG] += 1000;
+
+ sf->thresh_mult[THR_NEARMV] += 1000;
sf->thresh_mult[THR_NEARA] += 1000;
+ sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+ sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
sf->thresh_mult[THR_TM] += 1000;
@@ -584,19 +615,12 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_COMP_NEARGA] += 1500;
sf->thresh_mult[THR_COMP_NEWGA] += 2000;
- sf->thresh_mult[THR_SPLITMV] += 2500;
- sf->thresh_mult[THR_SPLITG] += 2500;
- sf->thresh_mult[THR_SPLITA] += 2500;
- sf->thresh_mult[THR_COMP_SPLITLA] += 4500;
- sf->thresh_mult[THR_COMP_SPLITGA] += 4500;
-
sf->thresh_mult[THR_ZEROMV] += 2000;
sf->thresh_mult[THR_ZEROG] += 2000;
sf->thresh_mult[THR_ZEROA] += 2000;
sf->thresh_mult[THR_COMP_ZEROLA] += 2500;
sf->thresh_mult[THR_COMP_ZEROGA] += 2500;
- sf->thresh_mult[THR_B_PRED] += 2500;
sf->thresh_mult[THR_H_PRED] += 2000;
sf->thresh_mult[THR_V_PRED] += 2000;
sf->thresh_mult[THR_D45_PRED ] += 2500;
@@ -606,49 +630,24 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_D207_PRED] += 2500;
sf->thresh_mult[THR_D63_PRED] += 2500;
- if (cpi->sf.skip_lots_of_modes) {
- for (i = 0; i < MAX_MODES; ++i)
- sf->thresh_mult[i] = INT_MAX;
-
- sf->thresh_mult[THR_DC] = 2000;
- sf->thresh_mult[THR_TM] = 2000;
- sf->thresh_mult[THR_NEWMV] = 4000;
- sf->thresh_mult[THR_NEWG] = 4000;
- sf->thresh_mult[THR_NEWA] = 4000;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG] = 0;
- sf->thresh_mult[THR_NEARESTA] = 0;
- sf->thresh_mult[THR_NEARMV] = 2000;
- sf->thresh_mult[THR_NEARG] = 2000;
- sf->thresh_mult[THR_NEARA] = 2000;
- sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
- sf->thresh_mult[THR_SPLITMV] = 2500;
- sf->thresh_mult[THR_SPLITG] = 2500;
- sf->thresh_mult[THR_SPLITA] = 2500;
- sf->recode_loop = 0;
- }
-
/* disable frame modes if flags not set */
if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
sf->thresh_mult[THR_NEWMV ] = INT_MAX;
sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
sf->thresh_mult[THR_NEARMV ] = INT_MAX;
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
}
if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
sf->thresh_mult[THR_ZEROG ] = INT_MAX;
sf->thresh_mult[THR_NEARG ] = INT_MAX;
sf->thresh_mult[THR_NEWG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
}
if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
sf->thresh_mult[THR_ZEROA ] = INT_MAX;
sf->thresh_mult[THR_NEARA ] = INT_MAX;
sf->thresh_mult[THR_NEWA ] = INT_MAX;
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
}
if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
@@ -657,7 +656,6 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX;
sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
}
if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
(VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
@@ -665,17 +663,42 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX;
sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
}
+}
+
+static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
+ SPEED_FEATURES *sf = &cpi->sf;
+ int i;
+
+ for (i = 0; i < MAX_REFS; ++i)
+ sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0;
- if (sf->disable_splitmv == 1) {
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ sf->thresh_mult_sub8x8[THR_LAST] += 2500;
+ sf->thresh_mult_sub8x8[THR_GOLD] += 2500;
+ sf->thresh_mult_sub8x8[THR_ALTR] += 2500;
+ sf->thresh_mult_sub8x8[THR_INTRA] += 2500;
+ sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
+ sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
- sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; i++) {
+ if (sf->disable_split_mask & (1 << i))
+ sf->thresh_mult_sub8x8[i] = INT_MAX;
}
+
+  // Disable the mode test if the corresponding reference frame flag is not set.
+ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
+ sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
+ if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
+ sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
+ if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
+ sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
+ if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
+ (VP9_LAST_FLAG | VP9_ALT_FLAG))
+ sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
+ if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
+ (VP9_GOLD_FLAG | VP9_ALT_FLAG))
+ sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
}
void vp9_set_speed_features(VP9_COMP *cpi) {
@@ -688,12 +711,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
if (mode > 1)
mode = 1;
- // Initialise default mode frequency sampling variables
- for (i = 0; i < MAX_MODES; i ++) {
- cpi->mode_check_freq[i] = 0;
- cpi->mode_test_hit_counts[i] = 0;
+ for (i = 0; i < MAX_MODES; ++i)
cpi->mode_chosen_counts[i] = 0;
- }
// best quality defaults
sf->RD = 1;
@@ -708,30 +727,28 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
- sf->use_lastframe_partitioning = 0;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->use_avoid_tested_higherror = 0;
sf->reference_masking = 0;
- sf->skip_lots_of_modes = 0;
- sf->partition_by_variance = 0;
sf->use_one_partition_size_always = 0;
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
sf->auto_min_max_partition_size = 0;
- sf->auto_min_max_partition_interval = 0;
- sf->auto_min_max_partition_count = 0;
sf->max_partition_size = BLOCK_64X64;
sf->min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
sf->last_partitioning_redo_frequency = 4;
- sf->disable_splitmv = 0;
+ sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 0;
- sf->intra_y_mode_mask = ALL_INTRA_MODES;
- sf->intra_uv_mode_mask = ALL_INTRA_MODES;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
+ sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+ }
sf->use_rd_breakout = 0;
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
@@ -747,8 +764,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->static_segmentation = 0;
#endif
+ sf->variance_adaptive_quantization = 0;
+
switch (mode) {
- case 0: // best quality mode
+ case 0: // This is the best quality mode.
break;
case 1:
@@ -759,121 +778,148 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->static_segmentation = 0;
#endif
sf->use_avoid_tested_higherror = 1;
- sf->adaptive_rd_thresh = MIN((speed + 1), 4);
+ sf->adaptive_rd_thresh = 1;
+ sf->recode_loop = (speed < 1);
if (speed == 1) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->use_square_partition_only = !frame_is_intra_only(&cpi->common);
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = frame_is_intra_only(&cpi->common)
+ ? USE_FULL_RD : USE_LARGESTALL;
+
+ if (MIN(cpi->common.width, cpi->common.height) >= 720)
+ sf->disable_split_mask = cpi->common.show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ else
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->auto_mv_step_size = 1;
+ sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+ if (speed == 2) {
+ sf->use_square_partition_only = !frame_is_intra_only(&cpi->common);
sf->less_rectangular_check = 1;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
- sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0);
- sf->disable_splitmv =
- (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+ sf->tx_size_search_method = frame_is_intra_only(&cpi->common)
+ ? USE_FULL_RD : USE_LARGESTALL;
+
+ if (MIN(cpi->common.width, cpi->common.height) >= 720)
+ sf->disable_split_mask = cpi->common.show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ else
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
- sf->use_uv_intra_rd_estimate = 1;
+
sf->use_rd_breakout = 1;
- sf->skip_encode_sb = 1;
- sf->use_lp32x32fdct = 1;
sf->adaptive_motion_search = 1;
sf->auto_mv_step_size = 1;
- sf->auto_min_max_partition_size = 1;
- sf->auto_min_max_partition_interval = 1;
- // FIXME(jingning): temporarily turn off disable_split_var_thresh
- // during refactoring process. will get this back after finishing
- // the main framework of partition search type.
- sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 16;
-
- sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
- sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
- sf->use_fast_coef_updates = 1;
- sf->mode_skip_start = 9;
- }
- if (speed == 2) {
- sf->less_rectangular_check = 1;
- sf->use_square_partition_only = 1;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->use_lastframe_partitioning = 1;
+
+ sf->auto_min_max_partition_size = 1;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
sf->adjust_partitioning_from_last_frame = 1;
sf->last_partitioning_redo_frequency = 3;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+
+ sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->mode_skip_start = 11;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+ if (speed == 3) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+
+ if (MIN(cpi->common.width, cpi->common.height) >= 720)
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ else
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
- FLAG_SKIP_COMP_REFMISMATCH |
- FLAG_SKIP_INTRA_LOWVAR |
- FLAG_EARLY_TERMINATE;
- sf->intra_y_mode_mask = INTRA_DC_TM;
- sf->intra_uv_mode_mask = INTRA_DC_TM;
- sf->use_uv_intra_rd_estimate = 1;
+ FLAG_SKIP_INTRA_LOWVAR;
+
sf->use_rd_breakout = 1;
- sf->skip_encode_sb = 1;
- sf->use_lp32x32fdct = 1;
sf->adaptive_motion_search = 1;
- sf->using_small_partition_info = 0;
- sf->disable_splitmv =
- (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
sf->auto_mv_step_size = 1;
- sf->search_method = SQUARE;
- sf->subpel_iters_per_step = 1;
- sf->use_fast_lpf_pick = 1;
+
+ sf->disable_filter_search_var_thresh = 16;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
sf->auto_min_max_partition_size = 1;
- sf->auto_min_max_partition_interval = 2;
- sf->disable_split_var_thresh = 32;
- sf->disable_filter_search_var_thresh = 32;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+
+ sf->use_uv_intra_rd_estimate = 1;
+ sf->skip_encode_sb = 1;
+ sf->use_lp32x32fdct = 1;
+ sf->subpel_iters_per_step = 1;
sf->use_fast_coef_updates = 2;
- sf->mode_skip_start = 9;
+
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
}
- if (speed == 3) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->partition_by_variance = 1;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+ if (speed == 4) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_COMP_REFMISMATCH |
FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
+
sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->auto_mv_step_size = 1;
+
+ sf->disable_filter_search_var_thresh = 16;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+ sf->auto_min_max_partition_size = 1;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+
+ sf->use_uv_intra_rd_estimate = 1;
sf->skip_encode_sb = 1;
sf->use_lp32x32fdct = 1;
- sf->disable_splitmv = 1;
- sf->auto_mv_step_size = 1;
- sf->search_method = BIGDIA;
sf->subpel_iters_per_step = 1;
- sf->disable_split_var_thresh = 64;
- sf->disable_filter_search_var_thresh = 64;
- sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
sf->use_fast_coef_updates = 2;
- sf->mode_skip_start = 9;
+
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
+
+ /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ sf->search_method = BIGDIA;
+ sf->disable_split_var_thresh = 64;
+ sf->disable_filter_search_var_thresh = 64; */
}
- if (speed == 4) {
+ if (speed == 5) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->use_one_partition_size_always = 1;
sf->always_this_block_size = BLOCK_16X16;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+ sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ?
+ USE_FULL_RD : USE_LARGESTALL;
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
@@ -887,22 +933,25 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
// sf->reduce_first_step_size = 1;
// sf->reference_masking = 1;
- sf->disable_splitmv = 1;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->search_method = HEX;
sf->subpel_iters_per_step = 1;
sf->disable_split_var_thresh = 64;
sf->disable_filter_search_var_thresh = 96;
- sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+ }
sf->use_fast_coef_updates = 2;
- sf->mode_skip_start = 9;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
}
break;
-
}; /* switch */
// Set rd thresholds based on mode and speed setting
set_rd_speed_thresholds(cpi, mode);
+ set_rd_speed_thresholds_sub8x8(cpi, mode);
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -910,16 +959,16 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->optimize_coefficients = 0;
}
- cpi->mb.fwd_txm16x16 = vp9_short_fdct16x16;
- cpi->mb.fwd_txm8x8 = vp9_short_fdct8x8;
- cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
- cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+ // No recode for 1 pass.
+ if (cpi->pass == 0) {
+ sf->recode_loop = 0;
+ sf->optimize_coefficients = 0;
}
- cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4;
+ cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
+ if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
+ cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
+ }
if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative;
@@ -954,20 +1003,6 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
"Failed to allocate altref buffer");
}
-static int alloc_partition_data(VP9_COMP *cpi) {
- vpx_free(cpi->mb.pip);
-
- cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
- (cpi->common.mi_rows + MI_BLOCK_SIZE),
- sizeof(PARTITION_INFO));
- if (!cpi->mb.pip)
- return 1;
-
- cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
- return 0;
-}
-
void vp9_alloc_compressor_data(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
@@ -975,10 +1010,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
- if (alloc_partition_data(cpi))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate partition data");
-
if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -1001,11 +1032,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
- // Data used for real time vc mode to see if gf needs refreshing
- cpi->inter_zz_count = 0;
- cpi->gf_bad_count = 0;
- cpi->gf_update_recommended = 0;
-
vpx_free(cpi->mb_activity_map);
CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
@@ -1015,6 +1041,19 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
+
+ // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+  // block, where the mi unit size is 8x8.
+ vpx_free(cpi->above_context[0]);
+ CHECK_MEM_ERROR(cm, cpi->above_context[0],
+ vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) *
+ MAX_MB_PLANE,
+ sizeof(*cpi->above_context[0])));
+
+ vpx_free(cpi->above_seg_context);
+ CHECK_MEM_ERROR(cm, cpi->above_seg_context,
+ vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols),
+ sizeof(*cpi->above_seg_context)));
}
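
This hunk replaces the per-macroblock pip allocation with one flat above_context buffer that update_frame_size() (next hunk) slices into MAX_MB_PLANE plane pointers. Note the slicing multiplies by sizeof(*cpi->above_context[0]) inside pointer arithmetic, which only stays correct while ENTROPY_CONTEXT is a one-byte type, as it is in vp9_blockd.h. A toy version of the layout, with an invented mi_cols standing in for mi_cols_aligned_to_sb(cm->mi_cols):

#include <stdio.h>
#include <stdlib.h>

typedef char ENTROPY_CONTEXT;  /* one byte per 4x4 txfm unit, per vp9_blockd.h */
#define MAX_MB_PLANE 3

/* One flat calloc, sliced into MAX_MB_PLANE plane pointers, 2 contexts
 * per mi column (an mi unit is 8x8, so two 4x4 contexts span it). */
int main(void) {
  const int mi_cols = 16;  /* invented for the demo */
  const int per_plane = 2 * mi_cols;
  ENTROPY_CONTEXT *base = calloc(MAX_MB_PLANE * per_plane, sizeof(*base));
  ENTROPY_CONTEXT *plane[MAX_MB_PLANE];
  int i;
  for (i = 0; i < MAX_MB_PLANE; i++)
    plane[i] = base + i * per_plane;  /* plane slices of one buffer */
  printf("plane offsets: %d %d %d\n",
         (int)(plane[0] - base), (int)(plane[1] - base),
         (int)(plane[2] - base));
  free(base);
  return 0;
}
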
@@ -1047,13 +1086,18 @@ static void update_frame_size(VP9_COMP *cpi) {
vp9_init_dsmotion_compensation(&cpi->mb, y_stride);
}
}
+
+ {
+ int i;
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ cpi->above_context[i] = cpi->above_context[0] +
+ i * sizeof(*cpi->above_context[0]) * 2 *
+ mi_cols_aligned_to_sb(cm->mi_cols);
+ }
+ }
}
-// TODO perhaps change number of steps expose to outside world when setting
-// max and min limits. Also this will likely want refining for the extended Q
-// range.
-//
// Table that converts 0-63 Q range values passed in outside to the Qindex
// range used internally.
static const int q_trans[] = {
@@ -1080,11 +1124,14 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
if (framerate < 0.1)
framerate = 30;
- cpi->oxcf.framerate = framerate;
- cpi->output_framerate = cpi->oxcf.framerate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
- cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->oxcf.framerate = framerate;
+ cpi->output_framerate = cpi->oxcf.framerate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
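
The framerate block above is only rewrapped, not changed. As a quick numeric check of the budget math it computes (all figures invented):

#include <stdio.h>

int main(void) {
  const double target_bandwidth = 2000000.0;  /* 2 Mbps, assumed */
  const double framerate = 30.0;
  const int two_pass_vbrmin_section = 5;      /* percent, assumed */
  const int per_frame = (int)(target_bandwidth / framerate);
  const int min_frame = per_frame * two_pass_vbrmin_section / 100;
  printf("per-frame budget %d bits, vbr floor %d bits\n",
         per_frame, min_frame);  /* 66666 and 3333 */
  return 0;
}
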
@@ -1133,7 +1180,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
int i;
cpi->oxcf = *oxcf;
- cpi->goldfreq = 7;
cm->version = oxcf->version;
@@ -1195,6 +1241,12 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
switch (cpi->oxcf.Mode) {
// Real time and one pass deprecated in test code base
+ case MODE_GOODQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 2;
+ cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
+ break;
+
case MODE_FIRSTPASS:
cpi->pass = 1;
cpi->compressor_speed = 1;
@@ -1217,14 +1269,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
cpi->oxcf.lossless = oxcf->lossless;
- if (cpi->oxcf.lossless) {
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
- } else {
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
- }
-
+ cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
+ : vp9_idct4x4_add;
cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -1237,8 +1283,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cm->reset_frame_context = 0;
setup_features(cm);
- cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation
- set_mvcost(&cpi->mb);
+ cpi->common.allow_high_precision_mv = 0; // Default mv precision
+ set_mvcost(cpi);
{
int i;
@@ -1390,6 +1436,94 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
} while (++i <= MV_MAX);
}
+static void init_pick_mode_context(VP9_COMP *cpi) {
+ int i;
+ MACROBLOCK *x = &cpi->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP9_COMMON *cm = &cpi->common;
+
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+ const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+ if (i < BLOCK_16X16) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+ for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ }
+ }
+ } else if (i < BLOCK_32X32) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+ ++xd->mb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ }
+ } else if (i < BLOCK_64X64) {
+ for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ } else {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ }
+}
+
+static void free_pick_mode_context(MACROBLOCK *x) {
+ int i;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+ const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+ if (i < BLOCK_16X16) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+ for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ }
+ }
+ } else if (i < BLOCK_32X32) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+ ++xd->mb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ }
+ } else if (i < BLOCK_64X64) {
+ for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ } else {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ }
+}
+
VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
int i, j;
volatile union {
@@ -1426,6 +1560,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
init_config((VP9_PTR)cpi, oxcf);
+ init_pick_mode_context(cpi);
+
cm->current_video_frame = 0;
cpi->kf_overspend_bits = 0;
cpi->kf_bitrate_adjustment = 0;
@@ -1478,7 +1614,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
/*Initialize the feed-forward activity masking.*/
cpi->activity_avg = 90 << 12;
- cpi->frames_since_key = 8; // Give a sensible default for the first frame.
+ cpi->frames_since_key = 8; // Sensible default for first frame.
cpi->key_frame_frequency = cpi->oxcf.key_freq;
cpi->this_key_frame_forced = 0;
cpi->next_key_frame_forced = 0;
@@ -1599,9 +1735,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
vp9_set_speed_features(cpi);
// Default rd threshold factors for mode selection
- for (i = 0; i < BLOCK_SIZES; ++i)
+ for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j)
cpi->rd_thresh_freq_fact[i][j] = 32;
+ for (j = 0; j < MAX_REFS; ++j)
+ cpi->rd_thresh_freq_sub8x8[i][j] = 32;
+ }
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
SDX3F, SDX8F, SDX4DF)\
@@ -1712,10 +1851,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->common.error.setjmp = 0;
- vp9_zero(cpi->y_uv_mode_count)
+ vp9_zero(cpi->y_uv_mode_count);
#ifdef MODE_TEST_HIT_STATS
- vp9_zero(cpi->mode_test_hits)
+ vp9_zero(cpi->mode_test_hits);
#endif
return (VP9_PTR) cpi;
@@ -1757,8 +1896,10 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
FILE *f = fopen("opsnr.stt", "a");
double time_encoded = (cpi->last_end_time_stamp_seen
- cpi->first_time_stamp_ever) / 10000000.000;
- double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
- double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+ double total_encode_time = (cpi->time_receive_data +
+ cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * (double) 8 / (double)1000
+ / time_encoded;
if (cpi->b_calculate_psnr) {
YV12_BUFFER_CONFIG *lst_yv12 =
@@ -1778,20 +1919,15 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
dr, cpi->total / cpi->count, total_psnr,
cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
total_encode_time);
-// fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-// dr, cpi->total / cpi->count, total_psnr,
-// cpi->totalp / cpi->count, total_psnr2, total_ssim,
-// total_encode_time, cpi->tot_recode_hits);
}
if (cpi->b_calculate_ssimg) {
fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-// fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f %10ld\n", dr,
-// cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-// cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+ cpi->total_ssimg_y / cpi->count,
+ cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count,
+ cpi->total_ssimg_all / cpi->count, total_encode_time);
}
fclose(f);
@@ -1838,11 +1974,9 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
"[INTRA_MODES] =\n{\n");
for (i = 0; i < INTRA_MODES; i++) {
-
fprintf(fmode, " { // Above Mode : %d\n", i);
for (j = 0; j < INTRA_MODES; j++) {
-
fprintf(fmode, " {");
for (k = 0; k < INTRA_MODES; k++) {
@@ -1853,11 +1987,9 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
}
fprintf(fmode, "}, // left_mode %d\n", j);
-
}
fprintf(fmode, " },\n");
-
}
fprintf(fmode, "};\n");
@@ -1891,14 +2023,15 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
(cpi->time_receive_data + cpi->time_compress_data) / 1000);
}
#endif
-
}
+ free_pick_mode_context(&cpi->mb);
dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
- for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+ sizeof(cpi->mbgraph_stats[0]); ++i) {
vpx_free(cpi->mbgraph_stats[i].mb_stats);
}
@@ -1925,7 +2058,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
fclose(kf_list);
#endif
-
}
@@ -2246,14 +2378,15 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
cpi->frames_since_golden = 0;
// ******** Fixed Q test code only ************
- // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+ // If we are going to use the ALT reference for the next group of frames
+ // set a flag to say so.
if (cpi->oxcf.fixed_q >= 0 &&
cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
cpi->source_alt_ref_pending = 1;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
- // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a
- // large GF_interval
+ // TODO(ivan): For SVC encoder, GF automatic update is disabled by using
+ // a large GF_interval.
if (cpi->use_svc) {
cpi->frames_till_gf_update_due = INT_MAX;
}
@@ -2293,12 +2426,12 @@ static int find_fp_qindex() {
return i;
}
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) {
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
+ unsigned int *frame_flags) {
(void) size;
(void) dest;
(void) frame_flags;
-
vp9_set_quantizer(cpi, find_fp_qindex());
vp9_first_pass(cpi);
}
@@ -2306,13 +2439,11 @@ static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
#define WRITE_RECON_BUFFER 0
#if WRITE_RECON_BUFFER
void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
- // write the frame
FILE *yframe;
int i;
char filename[255];
- sprintf(filename, "cx\\y%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->y_height; i++)
@@ -2320,7 +2451,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->y_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "cx\\u%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -2328,7 +2459,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->uv_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "cx\\v%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -2350,8 +2481,10 @@ static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
for (i = 1; i < frame->y_height - 1; i++) {
for (j = 1; j < frame->y_width - 1; j++) {
/* Sobel hor and ver gradients */
- int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
- int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+ int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
+ (next[1] - next[-1]);
+ int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
+ (prev[-1] - next[-1]);
h = (h < 0 ? -h : h);
v = (v < 0 ? -v : v);
if (h > EDGE_THRESH || v > EDGE_THRESH)
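
The v and h gradients in this hunk are the 3x3 Sobel kernels written with row pointers (prev/curr/next are consecutive luma rows). A standalone check on a vertical step edge, which should produce a large |v| and h == 0:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int rows[3][3] = { { 0, 0, 255 },    /* prev */
                           { 0, 0, 255 },    /* curr */
                           { 0, 0, 255 } };  /* next */
  const int *prev = &rows[0][1];
  const int *curr = &rows[1][1];
  const int *next = &rows[2][1];
  const int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
                (next[1] - next[-1]);        /* horizontal derivative */
  const int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
                (prev[-1] - next[-1]);       /* vertical derivative */
  printf("|v|=%d |h|=%d\n", abs(v), abs(h)); /* 1020 and 0 */
  return 0;
}
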
@@ -2387,10 +2520,9 @@ static int recode_loop_test(VP9_COMP *cpi,
if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
((cpi->projected_frame_size < low_limit) && (q > minq))) {
force_recode = 1;
- }
- // Special Constrained quality tests
- else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- // Undershoot and below auto cq level
+ } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
if (q > cpi->cq_target_quality &&
cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
force_recode = 1;
@@ -2551,159 +2683,81 @@ static void full_to_model_counts(
}
}
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+ int recon_err;
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
- unsigned long *size,
- unsigned char *dest,
- unsigned int *frame_flags) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- TX_SIZE t;
- int q;
- int frame_over_shoot_limit;
- int frame_under_shoot_limit;
-
- int loop = 0;
- int loop_count;
-
- int q_low;
- int q_high;
-
- int top_index;
- int bottom_index;
- int active_worst_qchanged = 0;
-
- int overshoot_seen = 0;
- int undershoot_seen = 0;
-
- SPEED_FEATURES *sf = &cpi->sf;
- unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
- struct segmentation *seg = &cm->seg;
-
- /* Scale the source buffer, if required */
- if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
- cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
- scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
- cpi->Source = &cpi->scaled_source;
- } else {
- cpi->Source = cpi->un_scaled_source;
- }
-
- scale_references(cpi);
-
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state();
-
-
- // For an alt ref frame in 2 pass we skip the call to the second
- // pass function that sets the target bandwidth so must set it here
- if (cpi->refresh_alt_ref_frame) {
- // Per frame bit target for the alt ref frame
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
- // per second target bitrate
- cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
- cpi->output_framerate);
- }
-
- // Clear zbin over-quant value and mode boost values.
- cpi->zbin_mode_boost = 0;
-
- // Enable or disable mode based tweaking of the zbin
- // For 2 Pass Only used where GF/ARF prediction quality
- // is above a threshold
- cpi->zbin_mode_boost = 0;
-
- // if (cpi->oxcf.lossless)
- cpi->zbin_mode_boost_enabled = 0;
- // else
- // cpi->zbin_mode_boost_enabled = 1;
-
- // Current default encoder behaviour for the altref sign bias
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
-
- // Check to see if a key frame is signaled
- // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
- if ((cm->current_video_frame == 0) ||
- (cm->frame_flags & FRAMEFLAGS_KEY) ||
- (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
- // Key frame from VFW/auto-keyframe/first frame
- cm->frame_type = KEY_FRAME;
- }
+ vp9_clear_system_state(); // __asm emms;
- // Set default state for segment based loop filter update flags
- cm->lf.mode_ref_delta_update = 0;
+ recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
+ "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+ "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
+ "%10.3f %8d %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size, 0,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ vp9_convert_qindex_to_q(cm->base_qindex),
+ (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+ vp9_convert_qindex_to_q(cpi->active_best_quality),
+ vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q,
+ vp9_convert_qindex_to_q(cpi->ni_av_qi),
+ vp9_convert_qindex_to_q(cpi->cq_target_quality),
+ cpi->refresh_last_frame, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ (double)cpi->twopass.bits_left /
+ (1 + cpi->twopass.total_left_stats.coded_error),
+ cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct);
- // Initialize cpi->mv_step_param to default based on max resolution
- cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
- // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
- if (sf->auto_mv_step_size) {
- if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
- // initialize max_mv_magnitude for use in the first INTER frame
- // after a key/intra-only frame
- cpi->max_mv_magnitude = max_mv_def;
- } else {
- if (cm->show_frame)
- // allow mv_steps to correspond to twice the max mv magnitude found
- // in the previous frame, capped by the default max_mv_magnitude based
- // on resolution
- cpi->mv_step_param = vp9_init_search_range(
- cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
- cpi->max_mv_magnitude = 0;
- }
- }
+ fclose(f);
- // Set various flags etc to special state if it is a key frame
- if (cm->frame_type == KEY_FRAME) {
- // Reset the loop filter deltas and segmentation map
- setup_features(cm);
+ if (0) {
+ FILE *const fmodes = fopen("Modes.stt", "a");
+ int i;
- // If segmentation is enabled force a map update for key frames
- if (seg->enabled) {
- seg->update_map = 1;
- seg->update_data = 1;
- }
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+ cm->frame_type, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame);
- // The alternate reference frame cannot be active for a key frame
- cpi->source_alt_ref_active = 0;
+ for (i = 0; i < MAX_MODES; ++i)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+ for (i = 0; i < MAX_REFS; ++i)
+ fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
- cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
- cm->frame_parallel_decoding_mode =
- (cpi->oxcf.frame_parallel_decoding_mode != 0);
- if (cm->error_resilient_mode) {
- cm->frame_parallel_decoding_mode = 1;
- cm->reset_frame_context = 0;
- cm->refresh_frame_context = 0;
- }
- }
+ fprintf(fmodes, "\n");
- // Configure experimental use of segmentation for enhanced coding of
- // static regions if indicated.
- // Only allowed for now in second pass of two pass (as requires lagged coding)
- // and if the relevant speed feature flag is set.
- if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
- configure_static_seg_features(cpi);
+ fclose(fmodes);
}
+}
+#endif
- // Decide how big to make the frame
- vp9_pick_frame_size(cpi);
-
- vp9_clear_system_state();
-
+static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int *bottom_index, int *top_index) {
// Set an active best quality and if necessary active worst quality
- q = cpi->active_worst_quality;
+ int q = cpi->active_worst_quality;
+ VP9_COMMON *const cm = &cpi->common;
- if (cm->frame_type == KEY_FRAME) {
+ if (frame_is_intra_only(cm)) {
#if !CONFIG_MULTIPLE_ARF
- // Special case for key frames forced because we have reached
- // the maximum key frame interval. Here force the Q to a range
- // based on the ambient Q to reduce the risk of popping
+      // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
if (cpi->this_key_frame_forced) {
int delta_qindex;
int qindex = cpi->last_boosted_qindex;
double last_boosted_q = vp9_convert_qindex_to_q(qindex);
- delta_qindex = compute_qdelta(cpi, last_boosted_q,
- (last_boosted_q * 0.75));
+ delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+ (last_boosted_q * 0.75));
cpi->active_best_quality = MAX(qindex + delta_qindex,
cpi->best_quality);
@@ -2714,18 +2768,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
double q_val;
// Baseline value derived from cpi->active_worst_quality and kf boost
- if (cpi->kf_boost > high) {
- cpi->active_best_quality = kf_low_motion_minq[q];
- } else if (cpi->kf_boost < low) {
- cpi->active_best_quality = kf_high_motion_minq[q];
- } else {
- const int gap = high - low;
- const int offset = high - cpi->kf_boost;
- const int qdiff = kf_high_motion_minq[q] - kf_low_motion_minq[q];
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
- cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
- }
+ cpi->active_best_quality = get_active_quality(q, cpi->kf_boost,
+ low, high,
+ kf_low_motion_minq,
+ kf_high_motion_minq);
// Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) {
@@ -2739,84 +2785,78 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// on active_best_quality.
q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
cpi->active_best_quality +=
- compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
+ vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
}
#else
double current_q;
// Force the KF quantizer to be 30% of the active_worst_quality.
current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
cpi->active_best_quality = cpi->active_worst_quality
- + compute_qdelta(cpi, current_q, current_q * 0.3);
+ + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
#endif
- } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
int high = 2000;
int low = 400;
// Use the lower of cpi->active_worst_quality and recent
- // average Q as basis for GF/ARF Q limit unless last frame was
+ // average Q as basis for GF/ARF best Q limit unless last frame was
// a key frame.
if (cpi->frames_since_key > 1 &&
cpi->avg_frame_qindex < cpi->active_worst_quality) {
q = cpi->avg_frame_qindex;
}
// For constrained quality dont allow Q less than the cq level
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
- q < cpi->cq_target_quality) {
- q = cpi->cq_target_quality;
- }
- if (cpi->gfu_boost > high) {
- cpi->active_best_quality = gf_low_motion_minq[q];
- } else if (cpi->gfu_boost < low) {
- cpi->active_best_quality = gf_high_motion_minq[q];
- } else {
- const int gap = high - low;
- const int offset = high - cpi->gfu_boost;
- const int qdiff = gf_high_motion_minq[q] - gf_low_motion_minq[q];
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
- cpi->active_best_quality = gf_low_motion_minq[q] + adjustment;
- }
-
- // Constrained quality use slightly lower active best.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ if (q < cpi->cq_target_quality)
+ q = cpi->cq_target_quality;
+ if (cpi->frames_since_key > 1) {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ afq_low_motion_minq,
+ afq_high_motion_minq);
+ } else {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
+ }
+ // Constrained quality use slightly lower active best.
cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
- // TODO(debargha): Refine the logic below
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
if (!cpi->refresh_alt_ref_frame) {
cpi->active_best_quality = cpi->cq_target_quality;
} else {
if (cpi->frames_since_key > 1) {
- if (cpi->gfu_boost > high) {
- cpi->active_best_quality = cpi->cq_target_quality * 6 / 16;
- } else if (cpi->gfu_boost < low) {
- cpi->active_best_quality = cpi->cq_target_quality * 11 / 16;
- } else {
- const int gap = high - low;
- const int offset = high - cpi->gfu_boost;
- const int qdiff = cpi->cq_target_quality * 5 / 16;
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
- cpi->active_best_quality = cpi->cq_target_quality * 6 / 16
- + adjustment;
- }
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ afq_low_motion_minq,
+ afq_high_motion_minq);
+ } else {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
}
}
+ } else {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
}
} else {
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
cpi->active_best_quality = cpi->cq_target_quality;
} else {
-#ifdef ONE_SHOT_Q_ESTIMATE
-#ifdef STRICT_ONE_SHOT_Q
- cpi->active_best_quality = q;
-#else
- cpi->active_best_quality = inter_minq[q];
-#endif
-#else
cpi->active_best_quality = inter_minq[q];
-#endif
+      // 1-pass: for now, use the average Q for the active_best, if it's
+      // lower than active_worst.
+ if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
+ cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
- // For the constant/constrained quality mode we don't want
+ // For the constrained quality mode we don't want
// q to fall below the cq level.
if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
(cpi->active_best_quality < cpi->cq_target_quality)) {
@@ -2844,16 +2884,189 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cpi->active_worst_quality < cpi->active_best_quality)
cpi->active_worst_quality = cpi->active_best_quality;
- // Special case code to try and match quality with forced key frames
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
+ *top_index =
+ (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+ // If this is the first (key) frame in 1-pass, active best is the user
+ // best-allowed, and leave the top_index to active_worst.
+ if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ *top_index = cpi->oxcf.worst_allowed_q;
+ }
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ *top_index =
+ (cpi->active_worst_quality + cpi->active_best_quality) / 2;
+ } else {
+ *top_index = cpi->active_worst_quality;
+ }
+ *bottom_index = cpi->active_best_quality;
+
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
q = cpi->active_best_quality;
+ // Special case code to try and match quality with forced key frames
} else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
q = cpi->last_boosted_qindex;
} else {
- // Determine initial Q to try
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ // Determine initial Q to try.
+ if (cpi->pass == 0) {
+      // 1-pass: for now, use per-frame-bw for the target size of the frame,
+      // scaled up (currently 5x) for a key frame.
+ int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
+ q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth);
+ } else {
+ q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ }
+ if (q > *top_index)
+ q = *top_index;
}
+ return q;
+}
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned int *frame_flags) {
+ VP9_COMMON *const cm = &cpi->common;
+ TX_SIZE t;
+ int q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int loop = 0;
+ int loop_count;
+
+ int q_low;
+ int q_high;
+
+ int top_index;
+ int bottom_index;
+ int active_worst_qchanged = 0;
+
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+
+ SPEED_FEATURES *const sf = &cpi->sf;
+ unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
+ struct segmentation *const seg = &cm->seg;
+
+ /* Scale the source buffer, if required. */
+ if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
+ cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
+ scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+ cpi->Source = &cpi->scaled_source;
+ } else {
+ cpi->Source = cpi->un_scaled_source;
+ }
+ scale_references(cpi);
+
+ // Clear down mmx registers to allow floating point in what follows.
+ vp9_clear_system_state();
+
+ // For an alt ref frame in 2 pass we skip the call to the second
+ // pass function that sets the target bandwidth so we must set it here.
+ if (cpi->refresh_alt_ref_frame) {
+ // Set a per frame bit target for the alt ref frame.
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ // Set a per second target bitrate.
+ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
+ }
+
+ // Clear zbin over-quant value and mode boost values.
+ cpi->zbin_mode_boost = 0;
+
+ // Enable or disable mode based tweaking of the zbin.
+ // For 2 pass only used where GF/ARF prediction quality
+ // is above a threshold.
+ cpi->zbin_mode_boost = 0;
+ cpi->zbin_mode_boost_enabled = 0;
+
+ // Current default encoder behavior for the altref sign bias.
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
+
+ // Check to see if a key frame is signaled.
+  // For two-pass with auto key frame enabled, cm->frame_type may already be
+  // set, but not for one-pass.
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key %
+ cpi->key_frame_frequency == 0))) {
+  // Set the frame type to key frame for a forced key frame, if we exceed the
+  // maximum distance in automatic keyframe selection, or for the first
+  // frame.
+ cm->frame_type = KEY_FRAME;
+ }
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ // Initialize cpi->mv_step_param to default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+ // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+ if (sf->auto_mv_step_size) {
+ if (frame_is_intra_only(&cpi->common)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame)
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param = vp9_init_search_range(
+ cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+
+  // Set various flags etc. to special state if it is a key frame.
+ if (frame_is_intra_only(cm)) {
+ vp9_setup_key_frame(cpi);
+ // Reset the loop filter deltas and segmentation map.
+ setup_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->source_alt_ref_active = 0;
+
+ cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
+ cm->frame_parallel_decoding_mode =
+ (cpi->oxcf.frame_parallel_decoding_mode != 0);
+ if (cm->error_resilient_mode) {
+ cm->frame_parallel_decoding_mode = 1;
+ cm->reset_frame_context = 0;
+ cm->refresh_frame_context = 0;
+ } else if (cm->intra_only) {
+ // Only reset the current context.
+ cm->reset_frame_context = 2;
+ }
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in second pass of two pass (as requires lagged coding)
+ // and if the relevant speed feature flag is set.
+ if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
+ configure_static_seg_features(cpi);
+ }
+
+ // Decide how big to make the frame.
+ vp9_pick_frame_size(cpi);
+
+ vp9_clear_system_state();
+
+ q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+
+ q_high = top_index;
+ q_low = bottom_index;
+
vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
&frame_over_shoot_limit);
@@ -2868,7 +3081,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Set quantizer steps at 10% increments.
new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
- q = cpi->active_worst_quality + compute_qdelta(cpi, current_q, new_q);
+ q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q);
bottom_index = q;
top_index = q;
@@ -2876,24 +3089,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
q_high = q;
printf("frame:%d q:%d\n", cm->current_video_frame, q);
- } else {
-#endif
- // Limit Q range for the adaptive loop.
- bottom_index = cpi->active_best_quality;
- top_index = cpi->active_worst_quality;
- q_low = cpi->active_best_quality;
- q_high = cpi->active_worst_quality;
-#if CONFIG_MULTIPLE_ARF
}
#endif
+
loop_count = 0;
vp9_zero(cpi->rd_tx_select_threshes);
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
/* TODO: Decide this more intelligently */
- xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
- set_mvcost(&cpi->mb);
+ cm->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
+ set_mvcost(cpi);
}
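frame_is_intra_only() replaces several direct KEY_FRAME checks in this patch
and, judging from the branch above that handles cm->intra_only alongside key
frames, presumably reduces to the following (an assumption; its definition is
outside this diff):

    static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
      // Key frames and intra-only inter frames code no motion vectors,
      // so both take the intra-only paths guarded by this predicate.
      return cm->frame_type == KEY_FRAME || cm->intra_only;
    }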
#if CONFIG_VP9_POSTPROC
@@ -2935,25 +3141,25 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_set_quantizer(cpi, q);
if (loop_count == 0) {
-
- // Set up entropy depending on frame type.
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
if (cm->frame_type == KEY_FRAME) {
- /* Choose which entropy context to use. When using a forward reference
- * frame, it immediately follows the keyframe, and thus benefits from
- * using the same entropy context established by the keyframe.
- * Otherwise, use the default context 0.
- */
- cm->frame_context_idx = cpi->oxcf.play_alternate;
vp9_setup_key_frame(cpi);
} else {
- /* Choose which entropy context to use. Currently there are only two
- * contexts used, one for normal frames and one for alt ref frames.
- */
- cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
+ if (!cm->intra_only && !cm->error_resilient_mode) {
+ cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
+ }
vp9_setup_inter_frame(cpi);
}
}
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_vaq_frame_setup(cpi);
+ }
+
// transform / motion compensation build reconstruction frame
vp9_encode_frame(cpi);
@@ -2977,14 +3183,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
frame_over_shoot_limit = 1;
active_worst_qchanged = 0;
- // Special case handling for forced key frames
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
loop = 0;
} else {
+ // Special case handling for forced key frames
if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
int last_q = q;
- int kf_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
int high_err_target = cpi->ambient_err;
int low_err_target = cpi->ambient_err >> 1;
@@ -3120,14 +3325,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// fixed interval. Note the reconstruction error if it is the frame before
// the forced key frame.
if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
- cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
}
if (cm->frame_type == KEY_FRAME)
cpi->refresh_last_frame = 1;
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+ cm->frame_to_show = get_frame_new_buffer(cm);
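Each replacement of &cm->yv12_fb[cm->new_fb_idx] with get_frame_new_buffer(cm)
in this patch points at a small accessor; a sketch consistent with the
expressions being replaced (the actual definition is not part of this diff):

    static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
      // The buffer the current frame is being reconstructed into.
      return &cm->yv12_fb[cm->new_fb_idx];
    }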
#if WRITE_RECON_BUFFER
if (cm->show_frame)
@@ -3168,7 +3372,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_adapt_coef_probs(&cpi->common);
}
- if (cpi->common.frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(&cpi->common)) {
FRAME_COUNTS *counts = &cpi->common.counts;
vp9_copy(counts->y_mode, cpi->y_mode_count);
@@ -3182,7 +3386,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (!cpi->common.error_resilient_mode &&
!cpi->common.frame_parallel_decoding_mode) {
vp9_adapt_mode_probs(&cpi->common);
- vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+ vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv);
}
}
@@ -3198,8 +3402,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->total_byte_count += (*size);
cpi->projected_frame_size = (*size) << 3;
+ // Post encode loop adjustment of Q prediction.
if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 2);
+ vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0);
cpi->last_q[cm->frame_type] = cm->base_qindex;
@@ -3222,9 +3427,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Keep a record of ambient average Q.
if (cm->frame_type != KEY_FRAME)
- cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex +
+ cm->base_qindex) >> 2;
- // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+ // Keep a record from which we can calculate the average Q excluding GF
+ // updates and key frames.
if (cm->frame_type != KEY_FRAME &&
!cpi->refresh_golden_frame &&
!cpi->refresh_alt_ref_frame) {
@@ -3242,7 +3449,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (!cm->show_frame)
cpi->bits_off_target -= cpi->projected_frame_size;
else
- cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth -
+ cpi->projected_frame_size;
// Clip the buffer level at the maximum buffer size
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
@@ -3266,125 +3474,30 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->total_actual_bits += cpi->projected_frame_size;
// Debug stats
- cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+ cpi->total_target_vs_actual += (cpi->this_frame_target -
+ cpi->projected_frame_size);
cpi->buffer_level = cpi->bits_off_target;
- // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+#ifndef DISABLE_RC_LONG_TERM_MEM
+ // Update bits left to the kf and gf groups to account for overshoot or
+ // undershoot on these frames
if (cm->frame_type == KEY_FRAME) {
- cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+ cpi->twopass.kf_group_bits += cpi->this_frame_target -
+ cpi->projected_frame_size;
cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
} else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
- cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+ cpi->twopass.gf_group_bits += cpi->this_frame_target -
+ cpi->projected_frame_size;
cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
}
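One subtlety, inferred purely from the hunks shown in this patch:
vp9_onyx_int.h keeps DISABLE_RC_LONG_TERM_MEM defined (to 0), so the #ifndef
guard added above is false wherever that header is included and this kf/gf
group-bits update compiles out. A guard that honored the macro's 0/1 value
would instead read:

    #if !DISABLE_RC_LONG_TERM_MEM
      /* ... kf/gf group-bits update ... */
    #endif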
-
- // Update the skip mb flag probabilities based on the distribution seen
- // in this frame.
- // update_base_skip_probs(cpi);
-
-#if CONFIG_INTERNAL_STATS
- {
- FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
- int recon_err;
-
- vp9_clear_system_state(); // __asm emms;
-
- recon_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
-
- if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
- "%10.3f %8d %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->this_frame_target,
- cpi->projected_frame_size, 0, //loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->refresh_last_frame,
- cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- (double)cpi->twopass.bits_left /
- cpi->twopass.total_left_stats.coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
- else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%5d %5d %5d %8d %8d %8.2f %10d %10.3f"
- "%8d %10d %10d %10d\n",
- cpi->common.current_video_frame,
- cpi->this_frame_target, cpi->projected_frame_size,
- 0, //loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->refresh_last_frame,
- cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
-
- fclose(f);
-
- if (0) {
- FILE *fmodes = fopen("Modes.stt", "a");
- int i;
-
- fprintf(fmodes, "%6d:%1d:%1d:%1d ",
- cpi->common.current_video_frame,
- cm->frame_type, cpi->refresh_golden_frame,
- cpi->refresh_alt_ref_frame);
-
- for (i = 0; i < MAX_MODES; i++)
- fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
- fprintf(fmodes, "\n");
-
- fclose(fmodes);
- }
- }
-
#endif
#if 0
- // Debug stats for segment feature experiments.
- print_seg_map(cpi);
+ output_frame_level_debug_stats(cpi);
#endif
-
- // If this was a kf or Gf note the Q
- if ((cm->frame_type == KEY_FRAME)
- || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
- cm->last_kf_gf_q = cm->base_qindex;
-
if (cpi->refresh_golden_frame == 1)
cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
else
@@ -3463,7 +3576,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#endif
}
- // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
cm->seg.update_map = 0;
cm->seg.update_data = 0;
cm->lf.mode_ref_delta_update = 0;
@@ -3487,6 +3601,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->mi = cm->mip + cm->mode_info_stride + 1;
cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
+ cpi->mb.e_mbd.mi_8x8 = cm->mi_grid_visible;
+ cpi->mb.e_mbd.mi_8x8[0] = cm->mi;
+
// Don't increment frame counters if this was an altref buffer
// update not a real frame
++cm->current_video_frame;
@@ -3495,28 +3612,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// restore prev_mi
cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
-
- #if 0
- {
- char filename[512];
- FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
- recon_file = fopen(filename, "wb");
- fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
- cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
- 1, recon_file);
- fclose(recon_file);
- }
-#endif
-#ifdef OUTPUT_YUV_REC
- vp9_write_yuv_rec_frame(cm);
-#endif
-
}
static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
unsigned char *dest, unsigned int *frame_flags) {
-
cpi->enable_encode_breakout = 1;
if (!cpi->refresh_alt_ref_frame)
@@ -3533,12 +3632,14 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
if (!cpi->refresh_alt_ref_frame) {
double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
+ * cpi->oxcf.two_pass_vbrmin_section
+ / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
+ / cpi->oxcf.framerate);
}
}
@@ -3612,8 +3713,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cpi->source = NULL;
- cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
- set_mvcost(&cpi->mb);
+ cpi->common.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
+ set_mvcost(cpi);
// Should we code an alternate reference frame.
if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) {
@@ -3651,7 +3752,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
}
cm->show_frame = 0;
- cm->intra_only = 0;
cpi->refresh_alt_ref_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_last_frame = 0;
@@ -3673,6 +3773,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
#endif
if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
cm->show_frame = 1;
+ cm->intra_only = 0;
#if CONFIG_MULTIPLE_ARF
// Is this frame the ARF overlay.
@@ -3829,7 +3930,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cm->frame_flags = *frame_flags;
// Reset the frame pointers to the current frame size
- vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9BORDERINPIXELS);
@@ -3840,6 +3941,10 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_vaq_init();
+ }
+
if (cpi->pass == 1) {
Pass1Encode(cpi, size, dest, frame_flags);
} else if (cpi->pass == 2) {
@@ -3876,7 +3981,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cpi->bytes += *size;
if (cm->show_frame) {
-
cpi->count++;
if (cpi->b_calculate_psnr) {
@@ -3986,9 +4090,9 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
vp9_ppflags_t *flags) {
VP9_COMP *cpi = (VP9_COMP *) comp;
- if (!cpi->common.show_frame)
+ if (!cpi->common.show_frame) {
return -1;
- else {
+ } else {
int ret;
#if CONFIG_VP9_POSTPROC
ret = vp9_post_proc_frame(&cpi->common, dest, flags);
@@ -4138,37 +4242,9 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
return 0;
}
-int vp9_switch_layer(VP9_PTR comp, int layer) {
- VP9_COMP *cpi = (VP9_COMP *)comp;
-
- if (cpi->use_svc) {
- cpi->current_layer = layer;
-
- // Use buffer i for layer i LST
- cpi->lst_fb_idx = layer;
-
- // Use buffer i-1 for layer i Alt (Inter-layer prediction)
- if (layer != 0) cpi->alt_fb_idx = layer - 1;
-
- // Use the rest for Golden
- if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES)
- cpi->gld_fb_idx = cpi->lst_fb_idx;
- else
- cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer;
-
- printf("Switching to layer %d:\n", layer);
- printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx,
- cpi->gld_fb_idx, cpi->alt_fb_idx);
- } else {
- printf("Switching layer not supported. Enable SVC first \n");
- }
- return 0;
-}
-
void vp9_set_svc(VP9_PTR comp, int use_svc) {
VP9_COMP *cpi = (VP9_COMP *)comp;
cpi->use_svc = use_svc;
- if (cpi->use_svc) printf("Enabled SVC encoder \n");
return;
}
diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h
index 3e5796f..9429c7f 100644
--- a/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -29,12 +29,7 @@
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/encoder/vp9_lookahead.h"
-// Experimental rate control switches
-#if CONFIG_ONESHOTQ
-#define ONE_SHOT_Q_ESTIMATE 0
-#define STRICT_ONE_SHOT_Q 0
#define DISABLE_RC_LONG_TERM_MEM 0
-#endif
// #define MODE_TEST_HIT_STATS
@@ -49,7 +44,8 @@
#define KEY_FRAME_CONTEXT 5
-#define MAX_MODES 36
+#define MAX_MODES 30
+#define MAX_REFS 6
#define MIN_THRESHMULT 32
#define MAX_THRESHMULT 512
@@ -61,16 +57,11 @@
#define INTRA_ZBIN_BOOST 0
typedef struct {
- nmv_context nmvc;
int nmvjointcost[MV_JOINTS];
int nmvcosts[2][MV_VALS];
int nmvcosts_hp[2][MV_VALS];
vp9_prob segment_pred_probs[PREDICTION_PROBS];
- vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
- vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
- vp9_prob single_ref_prob[REF_CONTEXTS][2];
- vp9_prob comp_ref_prob[REF_CONTEXTS];
unsigned char *last_frame_seg_map_copy;
@@ -79,20 +70,8 @@ typedef struct {
// 0 = ZERO_MV, MV
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
- vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
-
- vp9_prob y_mode_prob[4][INTRA_MODES - 1];
- vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
-
- vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS - 1];
-
int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
- vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-
- struct tx_probs tx_probs;
- vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
+ FRAME_CONTEXT fc;
} CODING_CONTEXT;
typedef struct {
@@ -169,19 +148,12 @@ typedef enum {
THR_COMP_NEARGA,
THR_COMP_NEWGA,
- THR_SPLITMV,
- THR_SPLITG,
- THR_SPLITA,
- THR_COMP_SPLITLA,
- THR_COMP_SPLITGA,
-
THR_ZEROMV,
THR_ZEROG,
THR_ZEROA,
THR_COMP_ZEROLA,
THR_COMP_ZEROGA,
- THR_B_PRED,
THR_H_PRED,
THR_V_PRED,
THR_D135_PRED,
@@ -193,6 +165,15 @@ typedef enum {
} THR_MODES;
typedef enum {
+ THR_LAST,
+ THR_GOLD,
+ THR_ALTR,
+ THR_COMP_LA,
+ THR_COMP_GA,
+ THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef enum {
DIAMOND = 0,
NSTEP = 1,
HEX = 2,
@@ -244,8 +225,15 @@ typedef enum {
#define ALL_INTRA_MODES 0x3FF
#define INTRA_DC_ONLY 0x01
#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
+typedef enum {
+ LAST_FRAME_PARTITION_OFF = 0,
+ LAST_FRAME_PARTITION_LOW_MOTION = 1,
+ LAST_FRAME_PARTITION_ALL = 2
+} LAST_FRAME_PARTITION_METHOD;
+
typedef struct {
int RD;
SEARCH_METHODS search_method;
@@ -254,21 +242,21 @@ typedef struct {
SUBPEL_SEARCH_METHODS subpel_search_method;
int subpel_iters_per_step;
int thresh_mult[MAX_MODES];
+ int thresh_mult_sub8x8[MAX_REFS];
int max_step_search_steps;
int reduce_first_step_size;
int auto_mv_step_size;
int optimize_coefficients;
int static_segmentation;
+ int variance_adaptive_quantization;
int comp_inter_joint_search_thresh;
int adaptive_rd_thresh;
int skip_encode_sb;
int skip_encode_frame;
- int use_lastframe_partitioning;
+ LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
TX_SIZE_SEARCH_METHOD tx_size_search_method;
int use_lp32x32fdct;
int use_avoid_tested_higherror;
- int skip_lots_of_modes;
- int partition_by_variance;
int use_one_partition_size_always;
int less_rectangular_check;
int use_square_partition_only;
@@ -276,13 +264,11 @@ typedef struct {
int reference_masking;
BLOCK_SIZE always_this_block_size;
int auto_min_max_partition_size;
- int auto_min_max_partition_interval;
- int auto_min_max_partition_count;
BLOCK_SIZE min_partition_size;
BLOCK_SIZE max_partition_size;
int adjust_partitioning_from_last_frame;
int last_partitioning_redo_frequency;
- int disable_splitmv;
+ int disable_split_mask;
int using_small_partition_info;
// TODO(jingning): combine the related motion search speed features
int adaptive_motion_search;
@@ -296,8 +282,8 @@ typedef struct {
// A source variance threshold below which filter search is disabled
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
- int intra_y_mode_mask;
- int intra_uv_mode_mask;
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
int use_rd_breakout;
int use_uv_intra_rd_estimate;
int use_fast_lpf_pick;
@@ -325,6 +311,7 @@ typedef struct VP9_COMP {
MACROBLOCK mb;
VP9_COMMON common;
VP9_CONFIG oxcf;
+ struct rdcost_block_args rdcost_stack;
struct lookahead_ctx *lookahead;
struct lookahead_entry *source;
@@ -339,13 +326,13 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG scaled_source;
unsigned int frames_till_alt_ref_frame;
- int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
- int source_alt_ref_active; // an alt ref frame has been encoded and is usable
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
- int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame
+ int is_src_frame_alt_ref;
- int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
- int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
+ int gold_is_last; // gold same as last frame ( short circuit gold searches)
+ int alt_is_last; // Alt same as last ( short circuit altref search)
int gold_is_alt; // don't do both alt and gold search ( just do gold).
int scaled_ref_idx[3];
@@ -382,19 +369,19 @@ typedef struct VP9_COMP {
// Ambient reconstruction err target for force key frames
int ambient_err;
- unsigned int mode_check_freq[MAX_MODES];
- unsigned int mode_test_hit_counts[MAX_MODES];
unsigned int mode_chosen_counts[MAX_MODES];
+ unsigned int sub8x8_mode_chosen_counts[MAX_REFS];
int64_t mode_skip_mask;
int ref_frame_mask;
int set_ref_frame_mask;
- int rd_threshes[BLOCK_SIZES][MAX_MODES];
+ int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
+ int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
- // FIXME(rbultje) int64_t?
- int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+ int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
@@ -404,9 +391,9 @@ typedef struct VP9_COMP {
// FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][TX_MODES];
- int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
int RDMULT;
int RDDIV;
@@ -424,14 +411,14 @@ typedef struct VP9_COMP {
double gf_rate_correction_factor;
unsigned int frames_since_golden;
- int frames_till_gf_update_due; // Count down till next GF
+ int frames_till_gf_update_due; // Count down till next GF
- int gf_overspend_bits; // Total bits overspent becasue of GF boost (cumulative)
+ int gf_overspend_bits; // cumulative bits overspent because of GF boost
- int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF
+ int non_gf_bitrate_adjustment; // Following GF to recover extra bits spent
- int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames
- int kf_bitrate_adjustment; // Current number of bit s to try and recover on each inter frame.
+ int kf_overspend_bits; // Bits spent on key frames to be recovered on inters
+ int kf_bitrate_adjustment; // number of bits to recover on each inter frame.
int max_gf_interval;
int baseline_gf_interval;
int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
@@ -439,9 +426,9 @@ typedef struct VP9_COMP {
int64_t key_frame_count;
int prior_key_frame_distance[KEY_FRAME_CONTEXT];
- int per_frame_bandwidth; // Current section per frame bandwidth target
- int av_per_frame_bandwidth; // Average frame size target for clip
- int min_frame_bandwidth; // Minimum allocation that should be used for any frame
+ int per_frame_bandwidth; // Current section per frame bandwidth target
+ int av_per_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
int inter_frame_target;
double output_framerate;
int64_t last_time_stamp_seen;
@@ -483,7 +470,7 @@ typedef struct VP9_COMP {
int y_mode_count[4][INTRA_MODES];
int y_uv_mode_count[INTRA_MODES][INTRA_MODES];
- unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ unsigned int partition_count[PARTITION_CONTEXTS][PARTITION_TYPES];
nmv_context_counts NMVcount;
@@ -514,14 +501,9 @@ typedef struct VP9_COMP {
int decimation_count;
// for real time encoding
- int avg_encode_time; // microsecond
- int avg_pick_mode_time; // microsecond
int speed;
- unsigned int cpu_freq; // Mhz
int compressor_speed;
- int interquantizer;
- int goldfreq;
int auto_worst_q;
int cpu_used;
int pass;
@@ -537,11 +519,6 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
- // Data used for real time conferencing mode to help determine if it would be good to update the gf
- int inter_zz_count;
- int gf_bad_count;
- int gf_update_recommended;
-
unsigned char *segmentation_map;
  // segment threshold for encode breakout
@@ -648,10 +625,10 @@ typedef struct VP9_COMP {
int dummy_packing; /* flag to indicate if packing is dummy */
- unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
- unsigned int txfm_stepdown_count[TX_SIZES];
+ unsigned int tx_stepdown_count[TX_SIZES];
int initial_width;
int initial_height;
@@ -682,6 +659,13 @@ typedef struct VP9_COMP {
// Debug / test stats
int64_t mode_test_hits[BLOCK_SIZES];
#endif
+
+ /* Y,U,V,(A) */
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
} VP9_COMP;
static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
@@ -714,9 +698,14 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
void vp9_set_speed_features(VP9_COMP *cpi);
-extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest);
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi);
-extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
+int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget);
+
+static int get_token_alloc(int mb_rows, int mb_cols) {
+ return mb_rows * mb_cols * (48 * 16 + 4);
+}
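vp9_compute_qdelta(), declared just above and used throughout the
requantization paths earlier in this patch, returns a qindex offset rather
than a q value. A sketch of the behavior implied by its call sites; the table
walk over vp9_convert_qindex_to_q() and the cpi->best_quality /
cpi->worst_quality bounds are assumptions:

    int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
      int start_index = cpi->worst_quality;
      int target_index = cpi->worst_quality;
      int i;

      // Find the first qindex whose real Q reaches qstart...
      for (i = cpi->best_quality; i < cpi->worst_quality; ++i) {
        start_index = i;
        if (vp9_convert_qindex_to_q(i) >= qstart)
          break;
      }

      // ...and the first qindex whose real Q reaches qtarget.
      for (i = cpi->best_quality; i < cpi->worst_quality; ++i) {
        target_index = i;
        if (vp9_convert_qindex_to_q(i) >= qtarget)
          break;
      }

      // Delta to add to a qindex so that Q moves from qstart toward qtarget.
      return target_index - start_index;
    }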
#endif // VP9_ENCODER_VP9_ONYX_INT_H_
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index 239fd6b..476ecaa 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -54,7 +54,8 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
src += srcoffset;
dst += dstoffset;
- // Loop through the Y plane raw and reconstruction data summing (square differences)
+ // Loop through the raw Y plane and reconstruction data summing the square
+ // differences.
for (i = 0; i < linestocopy; i += 16) {
for (j = 0; j < source->y_width; j += 16) {
unsigned int sse;
@@ -72,20 +73,6 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
// Enforce a minimum filter level based upon baseline Q
static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
int min_filter_level;
- /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
- if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
- min_filter_level = 0;
- else
- {
- if (q <= 10)
- min_filter_level = 0;
- else if (q <= 64)
- min_filter_level = 1;
- else
- min_filter_level = (q >> 6);
- }
- */
min_filter_level = 0;
return min_filter_level;
@@ -93,11 +80,7 @@ static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
// Enforce a maximum filter level based upon baseline Q
static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
- // PGW August 2006: Highest filter values almost always a bad idea
-
- // jbb chg: 20100118 - not so any more with this overquant stuff allow high values
- // with lots of intra coming in.
- int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+ int max_filter_level = MAX_LOOP_FILTER;
(void)base_qindex;
if (cpi->twopass.section_intra_rating > 8)
@@ -128,7 +111,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
int filt_best;
int filt_direction = 0;
- int Bias = 0; // Bias against raising loop filter and in favour of lowering it
+ int Bias = 0; // Bias against raising loop filter in favor of lowering it.
// Make a copy of the unfiltered / processed recon buffer
vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -136,7 +119,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
: cpi->oxcf.Sharpness;
- // Start the search at the previous frame filter level unless it is now out of range.
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
// Define the initial step size
@@ -153,9 +137,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
while (filter_step > 0) {
- Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
- // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
if (cpi->twopass.section_intra_rating < 20)
Bias = Bias * cpi->twopass.section_intra_rating / 20;
@@ -163,8 +146,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
if (cpi->common.tx_mode != ONLY_4X4)
Bias >>= 1;
- filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
- filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+ filt_high = ((filt_mid + filter_step) > max_filter_level)
+ ? max_filter_level
+ : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level)
+ ? min_filter_level
+ : (filt_mid - filter_step);
if ((filt_direction <= 0) && (filt_low != filt_mid)) {
// Get Low filter error score
@@ -176,7 +163,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
// Re-instate the unfiltered frame
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
- // If value is close to the best so far then bias towards a lower loop filter value.
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
if ((filt_err - Bias) < best_err) {
// Was it actually better than the previous best?
if (filt_err < best_err)
@@ -215,4 +203,3 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
lf->filter_level = filt_best;
}
-
diff --git a/libvpx/vp9/encoder/vp9_psnr.c b/libvpx/vp9/encoder/vp9_psnr.c
index 9439434..58294e1 100644
--- a/libvpx/vp9/encoder/vp9_psnr.c
+++ b/libvpx/vp9/encoder/vp9_psnr.c
@@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include "vpx_scale/yv12config.h"
-#include "math.h"
#define MAX_PSNR 100
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index 6c8b2a0..fca7525 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -12,6 +12,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/common/vp9_quant_common.h"
@@ -21,12 +22,14 @@
extern int enc_debug;
#endif
-void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
- int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
@@ -85,14 +88,15 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
*eob_ptr = eob + 1;
}
-void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr,
- int16_t *quant_ptr, int16_t *quant_shift_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2];
int x, y, z, sz;
@@ -173,25 +177,19 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
return res;
}
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks) {
- MACROBLOCKD *const xd = &mb->e_mbd;
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan) {
+ MACROBLOCKD *const xd = &x->e_mbd;
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- const int16_t *scan = get_scan_4x4(tx_type);
- const int16_t *iscan = get_iscan_4x4(tx_type);
-
- vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
- 16, mb->skip_block,
- mb->plane[pb_idx.plane].zbin,
- mb->plane[pb_idx.plane].round,
- mb->plane[pb_idx.plane].quant,
- mb->plane[pb_idx.plane].quant_shift,
- BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
- BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
- xd->plane[pb_idx.plane].dequant,
- mb->plane[pb_idx.plane].zbin_extra,
- &xd->plane[pb_idx.plane].eobs[pb_idx.block],
- scan, iscan);
+ struct macroblock_plane* p = &x->plane[pb_idx.plane];
+ struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+
+ vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
+ BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
+ pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -271,12 +269,15 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
int i;
+ VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
int zbin_extra;
- int segment_id = xd->this_mi->mbmi.segment_id;
+ int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id,
cpi->common.base_qindex);
+ int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+
// Y
zbin_extra = (cpi->common.y_dequant[qindex][1] *
(cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
@@ -315,6 +316,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
/* save this macroblock QIndex for vp9_update_zbin_extra() */
x->e_mbd.q_index = qindex;
+
+ /* R/D setup */
+ cpi->mb.errorperbit = rdmult >> 6;
+ cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+ vp9_initialize_me_consts(cpi, xd->q_index);
}
void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -337,10 +344,10 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) {
vp9_mb_init_quantizer(cpi, &cpi->mb);
}
-void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
+void vp9_set_quantizer(struct VP9_COMP *cpi, int q) {
VP9_COMMON *cm = &cpi->common;
- cm->base_qindex = Q;
+ cm->base_qindex = q;
// if any of the delta_q values are changing update flag will
// have to be set.
diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h
index 3229eaa..c078e1d 100644
--- a/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libvpx/vp9/encoder/vp9_quantize.h
@@ -13,31 +13,19 @@
#include "vp9/encoder/vp9_block.h"
-#define prototype_quantize_block(sym) \
- void (sym)(MACROBLOCK *mb, int b_idx)
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan);
-#define prototype_quantize_block_pair(sym) \
- void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2)
-
-#define prototype_quantize_mb(sym) \
- void (sym)(MACROBLOCK *x)
-
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
- int y_blocks);
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks);
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks);
struct VP9_COMP;
-extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
+void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
-extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_init_quantizer(struct VP9_COMP *cpi);
+void vp9_init_quantizer(struct VP9_COMP *cpi);
#endif // VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 2d12ba9..0aa3a68 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -59,9 +59,8 @@ static int kfboost_qadjust(int qindex) {
int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
double correction_factor) {
-
const double q = vp9_convert_qindex_to_q(qindex);
- int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
+ int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
// q based adjustment to baseline enumerator
enumerator += (int)(enumerator * q) >> 12;
@@ -76,35 +75,19 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
// restored with a call to vp9_restore_coding_context. These functions are
// intended for use in a re-code loop in vp9_compress_frame where the
// quantizer value is adjusted between loop iterations.
-
- cc->nmvc = cm->fc.nmvc;
vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts);
vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp);
- vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs);
-
- vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob);
- vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
- vp9_copy(cc->partition_prob, cm->fc.partition_prob);
-
vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
- vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
- vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
- vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob);
- vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob);
-
vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
- vp9_copy(cc->coef_probs, cm->fc.coef_probs);
- vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
- cc->tx_probs = cm->fc.tx_probs;
- vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs);
+ cc->fc = cm->fc;
}
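Collapsing the per-field vp9_copy() calls into cc->fc = cm->fc relies on C
struct assignment copying the whole FRAME_CONTEXT by value. That is safe here
because, judging by the fields just removed from CODING_CONTEXT, FRAME_CONTEXT
holds only embedded probability arrays and counts, with no pointers to
external storage; the restore path below mirrors it with cm->fc = cc->fc.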
void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -113,25 +96,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
-
- cm->fc.nmvc = cc->nmvc;
vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
- vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs);
-
- vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob);
- vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
- vp9_copy(cm->fc.partition_prob, cc->partition_prob);
-
vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
- vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
- vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
- vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob);
- vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob);
-
vpx_memcpy(cm->last_frame_seg_map,
cpi->coding_context.last_frame_seg_map_copy,
(cm->mi_rows * cm->mi_cols));
@@ -139,10 +109,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
- vp9_copy(cm->fc.coef_probs, cc->coef_probs);
- vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
- cm->fc.tx_probs = cc->tx_probs;
- vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs);
+ cm->fc = cc->fc;
}
void vp9_setup_key_frame(VP9_COMP *cpi) {
@@ -224,11 +191,12 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
cpi->this_frame_target = cpi->per_frame_bandwidth;
}
- // Sanity check that the total sum of adjustments is not above the maximum allowed
- // That is that having allowed for KF and GF penalties we have not pushed the
- // current interframe target to low. If the adjustment we apply here is not capable of recovering
- // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over
- // a longer time span via other buffer / rate control mechanisms.
+ // Check that the total sum of adjustments is not above the maximum allowed.
+ // That is, having allowed for the KF and GF penalties, we have not pushed
+ // the current inter-frame target too low. If the adjustment we apply here is
+ // not capable of recovering all the extra bits we have spent in the KF or GF,
+ // then the remainder will have to be recovered over a longer time span via
+ // other buffer / rate control mechanisms.
if (cpi->this_frame_target < min_frame_target)
cpi->this_frame_target = min_frame_target;
@@ -297,12 +265,12 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
rate_correction_factor);
// Work out a size correction factor.
- // if ( cpi->this_frame_target > 0 )
- // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
if (projected_size_based_on_q > 0)
- correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+ correction_factor =
+ (100 * cpi->projected_frame_size) / projected_size_based_on_q;
- // More heavily damped adjustment used if we have been oscillating either side of target
+ // More heavily damped adjustment used if we have been oscillating either side
+ // of target.
switch (damp_var) {
case 0:
adjustment_limit = 0.75;
@@ -319,27 +287,29 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
if (correction_factor > 102) {
// We are not already at the worst allowable quality
- correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor =
+ ((rate_correction_factor * correction_factor) / 100);
// Keep rate_correction_factor within limits
if (rate_correction_factor > MAX_BPB_FACTOR)
rate_correction_factor = MAX_BPB_FACTOR;
- }
- // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
- else if (correction_factor < 99) {
+ } else if (correction_factor < 99) {
// We are not already at the best allowable quality
- correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor =
+ ((rate_correction_factor * correction_factor) / 100);
// Keep rate_correction_factor within limits
if (rate_correction_factor < MIN_BPB_FACTOR)
rate_correction_factor = MIN_BPB_FACTOR;
}
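A worked example of the damping above: if the frame overshoots so that
correction_factor computes to 120 while adjustment_limit is 0.75, the damped
value is 100 + (120 - 100) * 0.75 = 115, and rate_correction_factor is scaled
by 115 / 100 = 1.15 before being clamped to MAX_BPB_FACTOR. Dropping the old
100.5 / 99.5 rounding offsets in this patch leaves the damped factor landing
slightly closer to 100, i.e. toward "no change", than before.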
- if (cpi->common.frame_type == KEY_FRAME)
+ if (cpi->common.frame_type == KEY_FRAME) {
cpi->key_frame_rate_correction_factor = rate_correction_factor;
- else {
+ } else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
cpi->gf_rate_correction_factor = rate_correction_factor;
else
@@ -358,20 +328,24 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
double correction_factor;
// Select the appropriate correction factor based upon type of frame.
- if (cpi->common.frame_type == KEY_FRAME)
+ if (cpi->common.frame_type == KEY_FRAME) {
correction_factor = cpi->key_frame_rate_correction_factor;
- else {
+ } else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
correction_factor = cpi->gf_rate_correction_factor;
else
correction_factor = cpi->rate_correction_factor;
}
- // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+ // Calculate required scaling factor based on target frame size and size of
+ // frame produced using previous Q.
if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int
+ target_bits_per_mb =
+ (target_bits_per_frame / cpi->common.MBs)
+ << BPER_MB_NORMBITS; // Case where we would overflow int
else
- target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+ target_bits_per_mb =
+ (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
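The branch above exists to avoid 32-bit overflow: target_bits_per_frame <<
BPER_MB_NORMBITS is computed in int arithmetic, so once target_bits_per_frame
reaches INT_MAX >> BPER_MB_NORMBITS the shift would overflow. For such large
targets the division by cpi->common.MBs is done first, trading a little
fixed-point precision for safety.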
i = cpi->active_best_quality;
@@ -437,7 +411,6 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
}
av_key_frame_frequency /= total_weight;
-
}
return av_key_frame_frequency;
}
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index 4733176..ddda713 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -32,8 +32,8 @@ int vp9_pick_frame_size(VP9_COMP *cpi);
double vp9_convert_qindex_to_q(int qindex);
int vp9_gfboost_qadjust(int qindex);
-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor);
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor);
void vp9_setup_inter_frame(VP9_COMP *cpi);
#endif // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index df00334..993919e 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -36,7 +36,7 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_common.h"
@@ -45,58 +45,59 @@
/* Factor to weigh the rate for switchable interp filters */
#define SWITCHABLE_INTERP_RATE_FACTOR 1
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+#define LAST_FRAME_MODE_MASK 0xFFEDCD60
+#define GOLDEN_FRAME_MODE_MASK 0xFFDA3BB0
+#define ALT_REF_MODE_MASK 0xFFC648D0
-#define LAST_FRAME_MODE_MASK 0xFFDADCD60
-#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0
-#define ALT_REF_MODE_MASK 0xFF8C648D0
+#define MIN_EARLY_TERM_INDEX 3
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {RD_NEARESTMV, LAST_FRAME, NONE},
- {RD_NEARESTMV, ALTREF_FRAME, NONE},
- {RD_NEARESTMV, GOLDEN_FRAME, NONE},
-
- {RD_DC_PRED, INTRA_FRAME, NONE},
-
- {RD_NEWMV, LAST_FRAME, NONE},
- {RD_NEWMV, ALTREF_FRAME, NONE},
- {RD_NEWMV, GOLDEN_FRAME, NONE},
-
- {RD_NEARMV, LAST_FRAME, NONE},
- {RD_NEARMV, ALTREF_FRAME, NONE},
- {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME},
- {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_TM_PRED, INTRA_FRAME, NONE},
-
- {RD_NEARMV, LAST_FRAME, ALTREF_FRAME},
- {RD_NEWMV, LAST_FRAME, ALTREF_FRAME},
- {RD_NEARMV, GOLDEN_FRAME, NONE},
- {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
- {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_SPLITMV, LAST_FRAME, NONE},
- {RD_SPLITMV, GOLDEN_FRAME, NONE},
- {RD_SPLITMV, ALTREF_FRAME, NONE},
- {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME},
- {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_ZEROMV, LAST_FRAME, NONE},
- {RD_ZEROMV, GOLDEN_FRAME, NONE},
- {RD_ZEROMV, ALTREF_FRAME, NONE},
- {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME},
- {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_I4X4_PRED, INTRA_FRAME, NONE},
- {RD_H_PRED, INTRA_FRAME, NONE},
- {RD_V_PRED, INTRA_FRAME, NONE},
- {RD_D135_PRED, INTRA_FRAME, NONE},
- {RD_D207_PRED, INTRA_FRAME, NONE},
- {RD_D153_PRED, INTRA_FRAME, NONE},
- {RD_D63_PRED, INTRA_FRAME, NONE},
- {RD_D117_PRED, INTRA_FRAME, NONE},
- {RD_D45_PRED, INTRA_FRAME, NONE},
+ {NEARESTMV, LAST_FRAME, NONE},
+ {NEARESTMV, ALTREF_FRAME, NONE},
+ {NEARESTMV, GOLDEN_FRAME, NONE},
+
+ {DC_PRED, INTRA_FRAME, NONE},
+
+ {NEWMV, LAST_FRAME, NONE},
+ {NEWMV, ALTREF_FRAME, NONE},
+ {NEWMV, GOLDEN_FRAME, NONE},
+
+ {NEARMV, LAST_FRAME, NONE},
+ {NEARMV, ALTREF_FRAME, NONE},
+ {NEARESTMV, LAST_FRAME, ALTREF_FRAME},
+ {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {TM_PRED, INTRA_FRAME, NONE},
+
+ {NEARMV, LAST_FRAME, ALTREF_FRAME},
+ {NEWMV, LAST_FRAME, ALTREF_FRAME},
+ {NEARMV, GOLDEN_FRAME, NONE},
+ {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {ZEROMV, LAST_FRAME, NONE},
+ {ZEROMV, GOLDEN_FRAME, NONE},
+ {ZEROMV, ALTREF_FRAME, NONE},
+ {ZEROMV, LAST_FRAME, ALTREF_FRAME},
+ {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {H_PRED, INTRA_FRAME, NONE},
+ {V_PRED, INTRA_FRAME, NONE},
+ {D135_PRED, INTRA_FRAME, NONE},
+ {D207_PRED, INTRA_FRAME, NONE},
+ {D153_PRED, INTRA_FRAME, NONE},
+ {D63_PRED, INTRA_FRAME, NONE},
+ {D117_PRED, INTRA_FRAME, NONE},
+ {D45_PRED, INTRA_FRAME, NONE},
+};
+
+const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
+ {LAST_FRAME, NONE},
+ {GOLDEN_FRAME, NONE},
+ {ALTREF_FRAME, NONE},
+ {LAST_FRAME, ALTREF_FRAME},
+ {GOLDEN_FRAME, ALTREF_FRAME},
+ {INTRA_FRAME, NONE},
};
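
With the five RD_SPLITMV entries and RD_I4X4_PRED dropped from vp9_mode_order (the sub-8x8 reference combinations move to the new vp9_ref_order table), the list shrinks from 36 to 30 entries, so the per-reference mode masks fit in 32 bits again; the old nine-hex-digit constants did not. Assuming bit i of a mask corresponds to entry i of vp9_mode_order and a set bit marks an entry to skip, which is how such masks are normally consumed, the test looks like:

    #include <stdint.h>

    #define LAST_FRAME_MODE_MASK 0xFFEDCD60  /* value from the hunk above */

    /* Assumed semantics: bit i set means entry i of vp9_mode_order is
     * skipped for the given reference-frame setup. */
    static int mode_is_masked(uint32_t mode_skip_mask, int mode_index) {
      return (mode_skip_mask >> mode_index) & 1;
    }
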
// The baseline rd thresholds for breaking out of the rd loop for
@@ -106,8 +107,13 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
{2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
-#define MAX_RD_THRESH_FACT 64
-#define RD_THRESH_INC 1
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+#define RD_THRESH_POW 1.25
+#define RD_MULT_EPB_RATIO 64
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -155,18 +161,26 @@ void vp9_init_me_luts() {
}
}
-static int compute_rd_mult(int qindex) {
+int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
const int q = vp9_dc_quant(qindex, 0);
- return (11 * q * q) >> 2;
+ // TODO(debargha): Adjust the function below
+ int rdmult = 88 * q * q / 25;
+ if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ if (cpi->twopass.next_iiratio > 31)
+ rdmult += (rdmult * rd_iifactor[31]) >> 4;
+ else
+ rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+ }
+ return rdmult;
}
-static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
- if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) {
- assert(!"Invalid rd_mode");
- return MB_MODE_COUNT;
- }
- assert((int)rd_mode < (int)MB_MODE_COUNT);
- return (MB_PREDICTION_MODE)rd_mode;
+static int compute_rd_thresh_factor(int qindex) {
+ int q;
+ // TODO(debargha): Adjust the function below
+ q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
+ if (q < 8)
+ q = 8;
+ return q;
}
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
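
vp9_compute_rd_mult raises the multiplier from (11 * q * q) >> 2 to 88 * q * q / 25 (a factor of 3.52 instead of 2.75 on the squared DC quantizer) and folds the two-pass next_iiratio adjustment into the same function, while compute_rd_thresh_factor collapses the old shift-and-pow sequence into one floating-point expression floored at 8. A standalone arithmetic check; the DC quantizer value is illustrative and vp9_dc_quant() is not reproduced:

    #include <math.h>
    #include <stdio.h>

    #define RD_THRESH_POW 1.25

    int main(void) {
      const int q = 52;  /* illustrative vp9_dc_quant() output */
      const int rdmult = 88 * q * q / 25;        /* new formula: 9518 */
      const int rdmult_old = (11 * q * q) >> 2;  /* old formula: 7436 */
      int thresh = (int)(pow(q / 4.0, RD_THRESH_POW) * 5.12);
      if (thresh < 8)  /* floor from compute_rd_thresh_factor() */
        thresh = 8;
      printf("rdmult=%d (was %d), thresh_factor=%d\n",
             rdmult, rdmult_old, thresh);
      return 0;
    }
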
@@ -174,102 +188,90 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
}
+static void set_block_thresholds(VP9_COMP *cpi) {
+ int i, bsize, segment_id;
+ VP9_COMMON *cm = &cpi->common;
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
- int q, i, bsize;
-
- vp9_clear_system_state(); // __asm emms;
-
- // Further tests required to see if optimum is different
- // for key frames, golden frames and arf frames.
- // if (cpi->common.refresh_golden_frame ||
- // cpi->common.refresh_alt_ref_frame)
- qindex = clamp(qindex, 0, MAXQ);
-
- cpi->RDMULT = compute_rd_mult(qindex);
- if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
- if (cpi->twopass.next_iiratio > 31)
- cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
- else
- cpi->RDMULT +=
- (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
- }
- cpi->mb.errorperbit = cpi->RDMULT >> 6;
- cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
- vp9_set_speed_features(cpi);
-
- q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
- q <<= 2;
- if (q < 8)
- q = 8;
-
- if (cpi->RDMULT > 1000) {
- cpi->RDDIV = 1;
- cpi->RDMULT /= 100;
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ int q;
+ int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
+ q = compute_rd_thresh_factor(segment_qindex);
for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+      // Thresholds here seem unnecessarily harsh but are fine given the
+      // actual range of values used for cpi->sf.thresh_mult[].
+ int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+
for (i = 0; i < MAX_MODES; ++i) {
- // Threshold here seem unecessarily harsh but fine given actual
- // range of values used for cpi->sf.thresh_mult[]
- int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
-
- // *4 relates to the scaling of rd_thresh_block_size_factor[]
- if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
- cpi->rd_threshes[bsize][i] =
- cpi->sf.thresh_mult[i] * q *
- rd_thresh_block_size_factor[bsize] / (4 * 100);
+ if (cpi->sf.thresh_mult[i] < thresh_max) {
+ cpi->rd_threshes[segment_id][bsize][i] =
+ cpi->sf.thresh_mult[i] * q *
+ rd_thresh_block_size_factor[bsize] / 4;
} else {
- cpi->rd_threshes[bsize][i] = INT_MAX;
+ cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
}
}
- }
- } else {
- cpi->RDDIV = 100;
-
- for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
- for (i = 0; i < MAX_MODES; i++) {
- // Threshold here seem unecessarily harsh but fine given actual
- // range of values used for cpi->sf.thresh_mult[]
- int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
- if (cpi->sf.thresh_mult[i] < thresh_max) {
- cpi->rd_threshes[bsize][i] =
- cpi->sf.thresh_mult[i] * q *
- rd_thresh_block_size_factor[bsize] / 4;
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
+ cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
+ cpi->sf.thresh_mult_sub8x8[i] * q *
+ rd_thresh_block_size_factor[bsize] / 4;
} else {
- cpi->rd_threshes[bsize][i] = INT_MAX;
+ cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
}
}
}
}
+}
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int qindex, i;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ // Further tests required to see if optimum is different
+ // for key frames, golden frames and arf frames.
+ // if (cpi->common.refresh_golden_frame ||
+ // cpi->common.refresh_alt_ref_frame)
+ qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
+
+ cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128)
+ cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
+
+ cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
+ cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+ vp9_set_speed_features(cpi);
- fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
+ set_block_thresholds(cpi);
- for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(cpi->mb.partition_cost[i],
- cpi->common.fc.partition_prob[cpi->common.frame_type][i],
+ fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
+
+ for (i = 0; i < PARTITION_CONTEXTS; i++)
+ vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
/*rough estimate for costing*/
vp9_init_mode_costs(cpi);
- if (cpi->common.frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
vp9_build_nmv_cost_table(
cpi->mb.nmvjointcost,
- cpi->mb.e_mbd.allow_high_precision_mv ?
- cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
- &cpi->common.fc.nmvc,
- cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+ cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
+ &cm->fc.nmvc,
+ cm->allow_high_precision_mv, 1, 1);
for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
MB_PREDICTION_MODE m;
for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
- cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
+ cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
cost_token(vp9_inter_mode_tree,
- cpi->common.fc.inter_mode_probs[i],
- vp9_inter_mode_encodings - NEARESTMV + m);
+ cm->fc.inter_mode_probs[i],
+ &vp9_inter_mode_encodings[inter_mode_offset(m)]);
}
}
}
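
set_block_thresholds replaces the old RDMULT > 1000 special case with one loop that is now also per-segment: each segment derives its own qindex, including the y_dc_delta_q offset, and the thresh_max bound proves the multiply cannot overflow before it is performed. The guard in isolation:

    #include <limits.h>

    /* Overflow-guarded rd threshold: the product is formed only when
     * thresh_mult * q * size_factor is known to fit in an int. */
    static int scaled_rd_thresh(int thresh_mult, int q, int size_factor) {
      const int thresh_max = INT_MAX / (q * size_factor);
      return thresh_mult < thresh_max
                 ? thresh_mult * q * size_factor / 4
                 : INT_MAX;
    }
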
@@ -369,8 +371,8 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
double s2 = (double) var / n;
double x = qstep / sqrt(s2);
model_rd_norm(x, &R, &D);
- *rate = ((n << 8) * R + 0.5);
- *dist = (var * D + 0.5);
+ *rate = (int)((n << 8) * R + 0.5);
+ *dist = (int)(var * D + 0.5);
}
vp9_clear_system_state();
}
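
The casts added above only make the narrowing explicit; the + 0.5 they wrap is the usual round-to-nearest idiom for non-negative doubles:

    /* Round a non-negative double to the nearest int, as in the
     * *rate and *dist assignments above. */
    static int round_nonneg(double v) {
      return (int)(v + 0.5);
    }
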
@@ -397,7 +399,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
- dist_sum += dist;
+ dist_sum += (int)dist;
}
*out_rate_sum = rate_sum;
@@ -479,13 +481,13 @@ static const int16_t band_counts[TX_SIZES][8] = {
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
-static INLINE int cost_coeffs(MACROBLOCK *mb,
+static INLINE int cost_coeffs(MACROBLOCK *x,
int plane, int block,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
TX_SIZE tx_size,
const int16_t *scan, const int16_t *nb) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
@@ -493,9 +495,9 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
+ x->token_costs[tx_size][type][ref];
const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
- uint8_t token_cache[1024];
+ uint8_t *p_tok = x->token_cache;
int pt = combine_entropy_contexts(above_ec, left_ec);
int c, cost;
@@ -514,7 +516,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
int v = qcoeff_ptr[0];
int prev_t = vp9_dct_value_tokens_ptr[v].token;
cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
- token_cache[0] = vp9_pt_energy_class[prev_t];
+ p_tok[0] = vp9_pt_energy_class[prev_t];
++token_costs;
// ac tokens
@@ -524,9 +526,9 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
v = qcoeff_ptr[rc];
t = vp9_dct_value_tokens_ptr[v].token;
- pt = get_coef_context(nb, token_cache, c);
+ pt = get_coef_context(nb, p_tok, c);
cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
- token_cache[rc] = vp9_pt_energy_class[t];
+ p_tok[rc] = vp9_pt_energy_class[t];
prev_t = t;
if (!--band_left) {
band_left = *band_count++;
@@ -536,7 +538,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
// eob token
if (band_left) {
- pt = get_coef_context(nb, token_cache, c);
+ pt = get_coef_context(nb, p_tok, c);
cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
}
}
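
Across the cost_coeffs hunks the 1024-byte token_cache moves from the function's stack frame into the MACROBLOCK, so the hot RD loop stops re-touching a kilobyte of stack on every call; p_tok is simply an alias for the shared buffer. The shape of the change, with only the token_cache name taken from the diff:

    #include <stdint.h>

    #define MAX_COEFFS 1024  /* enough for a 32x32 transform block */

    typedef struct {
      uint8_t token_cache[MAX_COEFFS];  /* scratch reused across calls */
      /* ... other per-macroblock state ... */
    } MACROBLOCK_sketch;

    /* cost_coeffs()-style access: write through an alias instead of a
     * fresh local array. */
    static void mark_token(MACROBLOCK_sketch *x, int rc, uint8_t energy) {
      uint8_t *p_tok = x->token_cache;
      p_tok[rc] = energy;
    }
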
@@ -547,21 +549,6 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
return cost;
}
-struct rdcost_block_args {
- MACROBLOCK *x;
- ENTROPY_CONTEXT t_above[16];
- ENTROPY_CONTEXT t_left[16];
- TX_SIZE tx_size;
- int bw;
- int bh;
- int rate;
- int64_t dist;
- int64_t sse;
- int64_t best_rd;
- int skip;
- const int16_t *scan, *nb;
-};
-
static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
const int ss_txfrm_size = tx_size << 1;
struct rdcost_block_args* args = arg;
@@ -573,16 +560,16 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
int shift = args->tx_size == TX_32X32 ? 0 : 2;
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
- &this_sse) >> shift;
- args->sse += this_sse >> shift;
+ args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+ &this_sse) >> shift;
+ args->sse = this_sse >> shift;
if (x->skip_encode &&
- xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+ xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
- (1 << ss_txfrm_size)) >> shift;
- args->dist += p;
+ (1 << ss_txfrm_size)) >> (shift + 2);
+ args->dist += (p >> 4);
args->sse += p;
}
}
@@ -594,10 +581,9 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
int x_idx, y_idx;
txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
- args->rate += cost_coeffs(args->x, plane, block,
- args->t_above + x_idx,
- args->t_left + y_idx, args->tx_size,
- args->scan, args->nb);
+ args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
+ args->t_left + y_idx, args->tx_size,
+ args->scan, args->nb);
}
static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -610,85 +596,114 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
if (args->skip)
return;
- rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
- rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
- rd = MIN(rd1, rd2);
- if (rd > args->best_rd) {
- args->skip = 1;
- args->rate = INT_MAX;
- args->dist = INT64_MAX;
- args->sse = INT64_MAX;
- return;
- }
- if (!is_inter_block(&xd->this_mi->mbmi))
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi))
vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args);
else
vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args);
dist_block(plane, block, tx_size, args);
rate_block(plane, block, plane_bsize, tx_size, args);
-}
+ rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
-static void txfm_rd_in_plane(MACROBLOCK *x,
- int *rate, int64_t *distortion,
- int *skippable, int64_t *sse,
- int64_t ref_best_rd, int plane,
- BLOCK_SIZE bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
- const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
- const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
- int i;
- struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
- num_4x4_blocks_wide, num_4x4_blocks_high,
- 0, 0, 0, ref_best_rd, 0 };
+ // TODO(jingning): temporarily enabled only for luma component
+ rd = MIN(rd1, rd2);
if (plane == 0)
- xd->this_mi->mbmi.tx_size = tx_size;
+ x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
+
+ args->this_rate += args->rate;
+ args->this_dist += args->dist;
+ args->this_sse += args->sse;
+ args->this_rd += rd;
+ if (args->this_rd > args->best_rd) {
+ args->skip = 1;
+ return;
+ }
+}
+
+void vp9_get_entropy_contexts(TX_SIZE tx_size,
+ ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
+ const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
+ int num_4x4_w, int num_4x4_h) {
+ int i;
switch (tx_size) {
case TX_4X4:
- vpx_memcpy(&args.t_above, pd->above_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide);
- vpx_memcpy(&args.t_left, pd->left_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high);
- get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0),
- &args.scan, &args.nb);
+ vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
break;
case TX_8X8:
- for (i = 0; i < num_4x4_blocks_wide; i += 2)
- args.t_above[i] = !!*(uint16_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_blocks_high; i += 2)
- args.t_left[i] = !!*(uint16_t *)&pd->left_context[i];
- get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd),
- &args.scan, &args.nb);
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
break;
case TX_16X16:
- for (i = 0; i < num_4x4_blocks_wide; i += 4)
- args.t_above[i] = !!*(uint32_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_blocks_high; i += 4)
- args.t_left[i] = !!*(uint32_t *)&pd->left_context[i];
- get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd),
- &args.scan, &args.nb);
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
break;
case TX_32X32:
- for (i = 0; i < num_4x4_blocks_wide; i += 8)
- args.t_above[i] = !!*(uint64_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_blocks_high; i += 8)
- args.t_left[i] = !!*(uint64_t *)&pd->left_context[i];
- args.scan = vp9_default_scan_32x32;
- args.nb = vp9_default_scan_32x32_neighbors;
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
break;
default:
- assert(0);
+ assert(!"Invalid transform size.");
}
+}
+
+static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,
+ const int num_4x4_w, const int num_4x4_h,
+ const int64_t ref_rdcost,
+ struct rdcost_block_args *arg) {
+ vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
+ arg->x = x;
+ arg->tx_size = tx_size;
+ arg->bw = num_4x4_w;
+ arg->bh = num_4x4_h;
+ arg->best_rd = ref_rdcost;
+}
+
+static void txfm_rd_in_plane(MACROBLOCK *x,
+ struct rdcost_block_args *rd_stack,
+ int *rate, int64_t *distortion,
+ int *skippable, int64_t *sse,
+ int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
+
+ init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
+ ref_best_rd, rd_stack);
+ if (plane == 0)
+ xd->mi_8x8[0]->mbmi.tx_size = tx_size;
- foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
- *distortion = args.dist;
- *rate = args.rate;
- *sse = args.sse;
- *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip);
+ vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
+ pd->above_context, pd->left_context,
+ num_4x4_w, num_4x4_h);
+
+ get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb);
+
+ foreach_transformed_block_in_plane(xd, bsize, plane,
+ block_yrd_txfm, rd_stack);
+ if (rd_stack->skip) {
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ } else {
+ *distortion = rd_stack->this_dist;
+ *rate = rd_stack->this_rate;
+ *sse = rd_stack->this_sse;
+ *skippable = vp9_is_skippable_in_plane(xd, bsize, plane);
+ }
}
static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
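
vp9_get_entropy_contexts, factored out above, collapses runs of per-4x4 context bytes into one flag per transform block by loading two, four, or eight of them as a single wide integer and testing for nonzero; since only zero-versus-nonzero matters, the result is endian-independent. A standalone check of the 8x8 case (ENTROPY_CONTEXT is assumed to be a one-byte type and the context rows are assumed suitably aligned for the wide load):

    #include <assert.h>
    #include <stdint.h>

    typedef int8_t ENTROPY_CONTEXT;  /* assumed one-byte type */

    int main(void) {
      const ENTROPY_CONTEXT above[4] = { 0, 1, 0, 0 };
      ENTROPY_CONTEXT t_above[4] = { 0 };
      int i;
      /* TX_8X8: two 4x4 context bytes collapse into one flag via a
       * single 16-bit load, as in the hunk above. */
      for (i = 0; i < 4; i += 2)
        t_above[i] = !!*(const uint16_t *)&above[i];
      assert(t_above[0] == 1);  /* above[1] is set */
      assert(t_above[2] == 0);  /* both bytes clear */
      return 0;
    }
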
@@ -696,28 +711,18 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
int *skip, int64_t *sse,
int64_t ref_best_rd,
BLOCK_SIZE bs) {
- const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
- if (max_txfm_size == TX_32X32 &&
- (cm->tx_mode == ALLOW_32X32 ||
- cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->tx_size = TX_32X32;
- } else if (max_txfm_size >= TX_16X16 &&
- (cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->tx_size = TX_16X16;
- } else if (cm->tx_mode != ONLY_4X4) {
- mbmi->tx_size = TX_8X8;
- } else {
- mbmi->tx_size = TX_4X4;
- }
- txfm_rd_in_plane(x, rate, distortion, skip,
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+ mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
+
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
&sse[mbmi->tx_size], ref_best_rd, 0, bs,
mbmi->tx_size);
- cpi->txfm_stepdown_count[0]++;
+ cpi->tx_stepdown_count[0]++;
}
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
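
choose_largest_txfm_size now reads the frame-level cap from tx_mode_to_biggest_tx_size[] and clamps it with the block's max_txsize_lookup[] entry; the MIN of the two reproduces the removed three-way if chain exactly. Reconstructed from those branches, with the enum values assumed:

    typedef enum { ONLY_4X4, ALLOW_8X8, ALLOW_16X16, ALLOW_32X32,
                   TX_MODE_SELECT, TX_MODES } TX_MODE;
    typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
      TX_4X4,    /* ONLY_4X4 */
      TX_8X8,    /* ALLOW_8X8 */
      TX_16X16,  /* ALLOW_16X16 */
      TX_32X32,  /* ALLOW_32X32 */
      TX_32X32,  /* TX_MODE_SELECT */
    };

    /* Largest transform both the block and the frame's tx_mode allow. */
    static TX_SIZE pick_tx_size(TX_SIZE max_tx_size, TX_MODE tx_mode) {
      return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
    }
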
@@ -729,13 +734,13 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
int s0, s1;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
for (n = TX_4X4; n <= max_tx_size; n++) {
r[n][1] = r[n][0];
@@ -811,15 +816,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
rd[TX_32X32][1] < rd[TX_16X16][1] &&
rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[0]++;
+ cpi->tx_stepdown_count[0]++;
} else if (max_tx_size >= TX_16X16 &&
rd[TX_16X16][1] < rd[TX_8X8][1] &&
rd[TX_16X16][1] < rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
- cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
@@ -829,10 +834,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
int *s, int *skip, int64_t *sse,
int64_t ref_best_rd,
BLOCK_SIZE bs) {
- const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
@@ -840,14 +845,14 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
// double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
// for (n = TX_4X4; n <= max_txfm_size; n++)
// r[n][0] = (r[n][0] * scale_r[n]);
- for (n = TX_4X4; n <= max_txfm_size; n++) {
+ for (n = TX_4X4; n <= max_tx_size; n++) {
r[n][1] = r[n][0];
- for (m = 0; m <= n - (n == max_txfm_size); m++) {
+ for (m = 0; m <= n - (n == max_tx_size); m++) {
if (m == n)
r[n][1] += vp9_cost_zero(tx_probs[m]);
else
@@ -859,7 +864,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
- for (n = TX_4X4; n <= max_txfm_size; n++) {
+ for (n = TX_4X4; n <= max_tx_size; n++) {
if (s[n]) {
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
@@ -867,19 +872,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
}
}
- for (n = TX_4X4; n <= max_txfm_size; n++) {
- rd[n][0] = (scale_rd[n] * rd[n][0]);
- rd[n][1] = (scale_rd[n] * rd[n][1]);
+ for (n = TX_4X4; n <= max_tx_size; n++) {
+ rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
+ rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
}
- if (max_txfm_size == TX_32X32 &&
+ if (max_tx_size == TX_32X32 &&
(cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
rd[TX_32X32][1] <= rd[TX_8X8][1] &&
rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
mbmi->tx_size = TX_32X32;
- } else if (max_txfm_size >= TX_16X16 &&
+ } else if (max_tx_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
@@ -898,22 +903,22 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
// Actually encode using the chosen mode if a model was used, but do not
// update the r, d costs
- txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
- ref_best_rd, 0, bs, mbmi->tx_size);
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+ &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
- if (max_txfm_size == TX_32X32 &&
+ if (max_tx_size == TX_32X32 &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
rd[TX_32X32][1] <= rd[TX_8X8][1] &&
rd[TX_32X32][1] <= rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[0]++;
- } else if (max_txfm_size >= TX_16X16 &&
+ cpi->tx_stepdown_count[0]++;
+ } else if (max_tx_size >= TX_16X16 &&
rd[TX_16X16][1] <= rd[TX_8X8][1] &&
rd[TX_16X16][1] <= rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
- cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
@@ -925,15 +930,17 @@ static void super_block_yrd(VP9_COMP *cpi,
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES], sse[TX_SIZES];
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
+ const int b_inter_mode = is_inter_block(mbmi);
assert(bs == mbmi->sb_type);
- if (mbmi->ref_frame[0] > INTRA_FRAME)
+ if (b_inter_mode)
vp9_subtract_sby(x, bs);
if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
(cpi->sf.tx_size_search_method != USE_FULL_RD &&
- mbmi->ref_frame[0] == INTRA_FRAME)) {
+ !b_inter_mode)) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
ref_best_rd, bs);
@@ -943,7 +950,7 @@ static void super_block_yrd(VP9_COMP *cpi,
}
if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
- mbmi->ref_frame[0] > INTRA_FRAME) {
+ b_inter_mode) {
if (bs >= BLOCK_32X32)
model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
&r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
@@ -961,14 +968,16 @@ static void super_block_yrd(VP9_COMP *cpi,
skip, sse, ref_best_rd, bs);
} else {
if (bs >= BLOCK_32X32)
- txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32);
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32],
+ ref_best_rd, 0, bs, TX_32X32);
if (bs >= BLOCK_16X16)
- txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16);
- txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16],
+ ref_best_rd, 0, bs, TX_16X16);
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
&sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
- txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
&sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache, bs);
@@ -1022,23 +1031,23 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
- TX_TYPE tx_type = DCT_DCT;
+
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
- int idx, idy, block;
+ int idx, idy;
uint8_t best_dst[8 * 8];
assert(ib < 4);
vpx_memcpy(ta, a, sizeof(ta));
vpx_memcpy(tl, l, sizeof(tl));
- xd->this_mi->mbmi.tx_size = TX_4X4;
+ xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
- if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
continue;
// Only do the oblique modes if the best so far is
@@ -1058,11 +1067,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
int64_t ssz;
const int16_t *scan;
+ const int16_t *nb;
uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
-
- block = ib + idy * 2 + idx;
- xd->this_mi->bmi[block].as_mode = mode;
+ const int block = ib + idy * 2 + idx;
+ TX_TYPE tx_type;
+ xd->mi_8x8[0]->bmi[block].as_mode = mode;
src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
vp9_predict_intra_block(xd, block, 1,
@@ -1075,29 +1085,28 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
dst, dst_stride);
tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
- if (tx_type != DCT_DCT) {
+ get_scan_nb_4x4(tx_type, &scan, &nb);
+
+ if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
- x->quantize_b_4x4(x, block, tx_type, 16);
- } else {
- x->fwd_txm4x4(src_diff, coeff, 16);
- x->quantize_b_4x4(x, block, tx_type, 16);
- }
+ else
+ x->fwd_txm4x4(src_diff, coeff, 8);
+
+ vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type));
- scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
ratey += cost_coeffs(x, 0, block,
- tempa + idx, templ + idy, TX_4X4, scan,
- vp9_get_coef_neighbors_handle(scan));
+ tempa + idx, templ + idy, TX_4X4, scan, nb);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &ssz) >> 2;
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
if (tx_type != DCT_DCT)
- vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+ vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
dst, pd->dst.stride, tx_type);
else
- xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
- dst, pd->dst.stride);
+ xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
+ 16);
}
}
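
In the reworked 4x4 intra loop the scan and neighbor tables come from a single get_scan_nb_4x4() call, the hybrid transform runs only when tx_type != DCT_DCT, and quantization goes through vp9_regular_quantize_b_4x4 with the matching scan/iscan pair; note the forward-transform stride also drops from 16 to 8, the width of the BLOCK_8X8 residual buffer. The dispatch reduced to a sketch, with stand-in kernel bodies rather than the real transforms:

    #include <string.h>

    typedef enum { DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST } TX_TYPE;

    /* Stand-ins for vp9_short_fht4x4() and x->fwd_txm4x4(). */
    static void fht4x4(const short *in, short *out, int stride, TX_TYPE t) {
      (void)stride; (void)t;
      memcpy(out, in, 16 * sizeof(*out));
    }
    static void fdct4x4(const short *in, short *out, int stride) {
      (void)stride;
      memcpy(out, in, 16 * sizeof(*out));
    }

    /* One kernel pick per 4x4 block; stride 8 matches the 8-pixel-wide
     * BLOCK_8X8 source-difference buffer. */
    static void fwd_txfm_4x4(const short *src_diff, short *coeff, TX_TYPE t) {
      if (t != DCT_DCT)
        fht4x4(src_diff, coeff, 8, t);
      else
        fdct4x4(src_diff, coeff, 8);
    }
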
@@ -1138,10 +1147,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
- MODE_INFO *const mic = xd->this_mi;
+ MODE_INFO *const mic = xd->mi_8x8[0];
const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO *left_mi = xd->mi_8x8[-1];
- const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
+ const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
+ const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
@@ -1166,9 +1175,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
i = idy * 2 + idx;
if (cpi->common.frame_type == KEY_FRAME) {
const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i);
- const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(mic, left_mi, i) :
- DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, i);
bmode_costs = mb->y_mode_costs[A][L];
}
@@ -1212,7 +1219,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE mode_selected = DC_PRED;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->this_mi;
+ MODE_INFO *const mic = xd->mi_8x8[0];
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
TX_SIZE best_tx = TX_4X4;
@@ -1227,15 +1234,14 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
int64_t local_tx_cache[TX_MODES];
MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
- MODE_INFO *left_mi = xd->mi_8x8[-1];
+ MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
- if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
continue;
if (cpi->common.frame_type == KEY_FRAME) {
const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0);
- const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(mic, left_mi, 0) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, 0);
bmode_costs = x->y_mode_costs[A][L];
}
@@ -1277,12 +1283,12 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
return best_rd;
}
-static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
int64_t *sse, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
int plane;
int pnrate = 0, pnskip = 1;
@@ -1300,7 +1306,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
*skippable = 1;
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
ref_best_rd, plane, bsize, uv_txfm_size);
if (pnrate == INT_MAX)
goto term;
@@ -1332,14 +1338,15 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
// int mode_mask = (bsize <= BLOCK_8X8)
// ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
// if (!(mode_mask & (1 << mode)))
- if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
+ & (1 << mode)))
continue;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
- super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
+ super_block_uvrd(cpi, x, &this_rate_tokenonly,
&this_distortion, &s, &this_sse, bsize, best_rd);
if (this_rate_tokenonly == INT_MAX)
continue;
@@ -1370,8 +1377,8 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
int64_t this_sse;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
- super_block_uvrd(&cpi->common, x, rate_tokenonly,
- distortion, skippable, &this_sse, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, rate_tokenonly, distortion,
+ skippable, &this_sse, bsize, INT64_MAX);
*rate = *rate_tokenonly +
x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1404,12 +1411,12 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
int mode_context) {
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int segment_id = xd->this_mi->mbmi.segment_id;
+ const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
// Don't account for mode here if segment skip is enabled.
if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
assert(is_inter_mode(mode));
- return x->inter_mode_cost[mode_context][mode - NEARESTMV];
+ return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
} else {
return 0;
}
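
inter_mode_offset(), used here and in the segmentation hunks below, simply names the mode - NEARESTMV arithmetic it replaces; the cost-table indexing is unchanged. Assuming the usual contiguous ordering of the VP9 inter modes:

    /* Inter modes are contiguous in MB_PREDICTION_MODE, NEARESTMV first
     * (the enum value is assumed). */
    enum { NEARESTMV = 10, NEARMV, ZEROMV, NEWMV };

    static int inter_mode_offset(int mode) {
      return mode - NEARESTMV;  /* NEARESTMV->0 ... NEWMV->3 */
    }
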
@@ -1426,10 +1433,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv);
-static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- int_mv *tmp_mv, int *rate_mv);
static int labels2mode(MACROBLOCK *x, int i,
MB_PREDICTION_MODE this_mode,
@@ -1440,12 +1443,13 @@ static int labels2mode(MACROBLOCK *x, int i,
int_mv *second_best_ref_mv,
int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->this_mi;
+ MODE_INFO *const mic = xd->mi_8x8[0];
MB_MODE_INFO *mbmi = &mic->mbmi;
int cost = 0, thismvcost = 0;
int idx, idy;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ const int has_second_rf = has_second_ref(mbmi);
/* We have to be careful retrieving previously-encoded motion vectors.
Ones from this macroblock have to be pulled from the BLOCKD array
@@ -1457,29 +1461,30 @@ static int labels2mode(MACROBLOCK *x, int i,
switch (m = this_mode) {
case NEWMV:
this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
- thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
- 102);
- if (mbmi->ref_frame[1] > 0) {
+ thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ if (has_second_rf) {
this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
- thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
- mvjcost, mvcost, 102);
+ thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
+ &second_best_ref_mv->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
}
break;
case NEARESTMV:
this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
this_second_mv->as_int =
frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
break;
case NEARMV:
this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
this_second_mv->as_int =
frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
break;
case ZEROMV:
this_mv->as_int = 0;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
this_second_mv->as_int = 0;
break;
default:
@@ -1490,10 +1495,11 @@ static int labels2mode(MACROBLOCK *x, int i,
mbmi->mode_context[mbmi->ref_frame[0]]);
mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
- x->partition_info->bmi[i].mode = m;
+ mic->bmi[i].as_mode = m;
+
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
@@ -1514,25 +1520,21 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
int k;
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
- MODE_INFO *const mi = xd->this_mi;
+ struct macroblock_plane *const p = &x->plane[0];
+ MODE_INFO *const mi = xd->mi_8x8[0];
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
const int width = plane_block_width(bsize, pd);
const int height = plane_block_height(bsize, pd);
int idx, idy;
- const int src_stride = x->plane[0].src.stride;
- uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i,
- x->plane[0].src.buf,
- src_stride);
- int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i,
- x->plane[0].src_diff);
- int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i);
- uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i,
+
+ uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i,
+ p->src.buf, p->src.stride);
+ uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i,
pd->dst.buf, pd->dst.stride);
int64_t thisdistortion = 0, thissse = 0;
- int thisrate = 0;
- int ref, second_ref = has_second_ref(&mi->mbmi);
-
- for (ref = 0; ref < 1 + second_ref; ++ref) {
+ int thisrate = 0, ref;
+ const int is_compound = has_second_ref(&mi->mbmi);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
pd->pre[ref].buf, pd->pre[ref].stride);
vp9_build_inter_predictor(pre, pd->pre[ref].stride,
@@ -1542,20 +1544,23 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
width, height, ref, &xd->subpix, MV_PRECISION_Q3);
}
- vp9_subtract_block(height, width, src_diff, 8, src, src_stride,
+ vp9_subtract_block(height, width,
+ raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src, p->src.stride,
dst, pd->dst.stride);
k = i;
for (idy = 0; idy < height / 4; ++idy) {
for (idx = 0; idx < width / 4; ++idx) {
int64_t ssz, rd, rd1, rd2;
+ int16_t* coeff;
k += (idy * 2 + idx);
- src_diff = raster_block_offset_int16(BLOCK_8X8, k,
- x->plane[0].src_diff);
- coeff = BLOCK_OFFSET(x->plane[0].coeff, k);
- x->fwd_txm4x4(src_diff, coeff, 16);
- x->quantize_b_4x4(x, k, DCT_DCT, 16);
+ coeff = BLOCK_OFFSET(p->coeff, k);
+ x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ coeff, 8);
+ vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT),
+ get_iscan_4x4(DCT_DCT));
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
@@ -1571,6 +1576,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
return INT64_MAX;
}
}
+
*distortion = thisdistortion >> 2;
*labelyrate = thisrate;
*sse = thissse >> 2;
@@ -1623,7 +1629,7 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
pd->pre[0].stride);
- if (mbmi->ref_frame[1])
+ if (has_second_ref(mbmi))
pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
pd->pre[1].stride);
}
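
From here on the diff swaps the three spellings of the compound-prediction test (ref_frame[1] > 0, <= 0, and plain truthiness) for one has_second_ref() predicate, often hoisted into a local has_second_rf. A minimal sketch of the predicate, with the reference-frame enum values assumed:

    typedef enum { NONE = -1, INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME,
                   ALTREF_FRAME } MV_REFERENCE_FRAME;

    typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MB_MODE_INFO_sketch;

    /* A block is compound-predicted iff its second reference names a
     * real frame. */
    static int has_second_ref(const MB_MODE_INFO_sketch *mbmi) {
      return mbmi->ref_frame[1] > INTRA_FRAME;
    }
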
@@ -1633,19 +1639,21 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
x->plane[0].src = orig_src;
x->e_mbd.plane[0].pre[0] = orig_pre[0];
- if (mbmi->ref_frame[1])
+ if (has_second_ref(mbmi))
x->e_mbd.plane[0].pre[1] = orig_pre[1];
}
static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BEST_SEG_INFO *bsi_buf, int filter_idx,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
- int i, j, br = 0, idx, idy;
+ int i, br = 0, idx, idy;
int64_t bd = 0, block_sse = 0;
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
const int label_count = 4;
int64_t this_segment_rd = 0;
int label_mv_thresh;
@@ -1658,9 +1666,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
int mode_idx;
int subpelmv = 1, have_ref = 0;
+ const int has_second_rf = has_second_ref(mbmi);
- vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
- vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+ vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
+ vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
v_fn_ptr = &cpi->fn_ptr[bsize];
@@ -1682,17 +1691,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
i = idy * 2 + idx;
frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
- frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
- vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+ vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
&frame_mv[NEARESTMV][mbmi->ref_frame[0]],
&frame_mv[NEARMV][mbmi->ref_frame[0]],
i, 0, mi_row, mi_col);
- if (mbmi->ref_frame[1] > 0)
- vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
- &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
- &frame_mv[NEARMV][mbmi->ref_frame[1]],
- i, 1, mi_row, mi_col);
-
+ if (has_second_rf) {
+ frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
+ vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
+ &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
+ &frame_mv[NEARMV][mbmi->ref_frame[1]],
+ i, 1, mi_row, mi_col);
+ }
// search for the best motion vector on this segment
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
const struct buf_2d orig_src = x->plane[0].src;
@@ -1705,7 +1714,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if ((this_mode == NEARMV || this_mode == NEARESTMV ||
this_mode == ZEROMV) &&
frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
- (mbmi->ref_frame[1] <= 0 ||
+ (!has_second_rf ||
frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
int c1 = cost_mv_ref(cpi, NEARMV, rfc);
@@ -1720,7 +1729,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
continue;
} else {
assert(this_mode == ZEROMV);
- if (mbmi->ref_frame[1] <= 0) {
+ if (!has_second_rf) {
if ((c3 >= c2 &&
frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
(c3 >= c1 &&
@@ -1738,14 +1747,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+ vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
sizeof(bsi->rdstat[i][mode_idx].ta));
vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
sizeof(bsi->rdstat[i][mode_idx].tl));
// motion search for newmv (single predictor case only)
- if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV &&
+ if (!has_second_rf && this_mode == NEWMV &&
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
int step_param = 0;
int further_steps;
@@ -1795,20 +1804,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// adjust src pointer for this block
mi_buf_shift(x, i);
if (cpi->sf.search_method == HEX) {
- bestsme = vp9_hex_search(x, &mvp_full,
+ bestsme = vp9_hex_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1, v_fn_ptr, 1,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ &bsi->ref_mv->as_mv,
+ &mode_mv[NEWMV].as_mv);
} else if (cpi->sf.search_method == SQUARE) {
- bestsme = vp9_square_search(x, &mvp_full,
+ bestsme = vp9_square_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1, v_fn_ptr, 1,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ &bsi->ref_mv->as_mv,
+ &mode_mv[NEWMV].as_mv);
} else if (cpi->sf.search_method == BIGDIA) {
- bestsme = vp9_bigdia_search(x, &mvp_full,
+ bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1, v_fn_ptr, 1,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ &bsi->ref_mv->as_mv,
+ &mode_mv[NEWMV].as_mv);
} else {
bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
sadpb, further_steps, 0, v_fn_ptr,
@@ -1839,8 +1851,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int distortion;
unsigned int sse;
- cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
- bsi->ref_mv, x->errorperbit, v_fn_ptr,
+ cpi->find_fractional_mv_step(x,
+ &mode_mv[NEWMV].as_mv,
+ &bsi->ref_mv->as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit, v_fn_ptr,
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&distortion, &sse);
@@ -1856,12 +1871,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
mi_buf_restore(x, orig_src, orig_pre);
}
- if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV &&
- mbmi->interp_filter == EIGHTTAP) {
+ if (has_second_rf) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
+ }
+ if (has_second_rf && this_mode == NEWMV &&
+ mbmi->interp_filter == EIGHTTAP) {
// adjust src pointers
mi_buf_shift(x, i);
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -1891,7 +1908,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (num_4x4_blocks_high > 1)
bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
mode_mv[this_mode].as_int;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_rf) {
bsi->rdstat[i][mode_idx].mvs[1].as_int =
second_mode_mv[this_mode].as_int;
if (num_4x4_blocks_wide > 1)
@@ -1905,7 +1922,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// Trap vectors that reach beyond the UMV borders
if (mv_check_bounds(x, &mode_mv[this_mode]))
continue;
- if (mbmi->ref_frame[1] > 0 &&
+ if (has_second_rf &&
mv_check_bounds(x, &second_mode_mv[this_mode]))
continue;
@@ -1915,7 +1932,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
(mode_mv[this_mode].as_mv.col & 0x0f);
have_ref = mode_mv[this_mode].as_int ==
ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_rf) {
subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
(second_mode_mv[this_mode].as_mv.col & 0x0f);
have_ref &= second_mode_mv[this_mode].as_int ==
@@ -1926,7 +1943,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
ref_bsi = bsi_buf + 1;
have_ref = mode_mv[this_mode].as_int ==
ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_rf) {
have_ref &= second_mode_mv[this_mode].as_int ==
ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
}
@@ -1936,6 +1953,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
sizeof(SEG_RDSTAT));
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs =
+ ref_bsi->rdstat[i + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs =
+ ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
best_rd = bsi->rdstat[i][mode_idx].brdcost;
@@ -1956,7 +1980,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
bsi->rdstat[i][mode_idx].brate, 0);
bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
- bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+ bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
}
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -1997,15 +2025,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->segment_rd = INT64_MAX;
return;
}
-
- for (j = 1; j < num_4x4_blocks_high; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j * 2],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
- for (j = 1; j < num_4x4_blocks_wide; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
}
} /* for each label */
@@ -2017,10 +2036,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// update the coding decisions
for (i = 0; i < 4; ++i)
- bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->modes[i] = mi->bmi[i].as_mode;
}
static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int_mv *best_ref_mv,
int_mv *second_best_ref_mv,
int64_t best_rd,
@@ -2036,7 +2056,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = xd->this_mi;
+ MODE_INFO *mi = xd->mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
int mode_idx;
@@ -2051,7 +2071,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++)
bsi->modes[i] = ZEROMV;
- rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col);
+ rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
+ mi_row, mi_col);
if (bsi->segment_rd > best_rd)
return INT64_MAX;
@@ -2059,10 +2080,10 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++) {
mode_idx = inter_mode_offset(bsi->modes[i]);
mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_ref(mbmi))
mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
- x->partition_info->bmi[i].mode = bsi->modes[i];
+ mi->bmi[i].as_mode = bsi->modes[i];
}
/*
@@ -2082,7 +2103,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
int ref_frame, BLOCK_SIZE block_size ) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int_mv this_mv;
int i;
int zero_seen = 0;
@@ -2195,22 +2216,18 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
- PARTITION_INFO *partition,
int_mv *ref_mv,
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
ctx->skip = x->skip;
ctx->best_mode_index = mode_index;
- ctx->mic = *xd->this_mi;
-
- if (partition)
- ctx->partition_info = *partition;
+ ctx->mic = *xd->mi_8x8[0];
ctx->best_ref_mv.as_int = ref_mv->as_int;
ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
@@ -2219,11 +2236,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
- // FIXME(rbultje) does this memcpy the whole array? I believe sizeof()
- // doesn't actually work this way
- memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
- memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
+ vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
+ vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
}
static void setup_pred_block(const MACROBLOCKD *xd,
@@ -2253,6 +2268,7 @@ static void setup_pred_block(const MACROBLOCKD *xd,
}
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int idx, MV_REFERENCE_FRAME frame_type,
BLOCK_SIZE block_size,
int mi_row, int mi_col,
@@ -2263,17 +2279,13 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *cm = &cpi->common;
YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
// set up scaling factors
scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
- scale[frame_type].x_offset_q4 =
- ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
- scale[frame_type].y_offset_q4 =
- ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
+ scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+ mi_row * MI_SIZE, mi_col * MI_SIZE);
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
@@ -2281,13 +2293,13 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
&scale[frame_type], &scale[frame_type]);
// Gets an initial list of candidate vectors from neighbours and orders them
- vp9_find_mv_refs(&cpi->common, xd, xd->this_mi,
+ vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0],
xd->last_mi,
frame_type,
mbmi->ref_mvs[frame_type], mi_row, mi_col);
// Candidate refinement carried out at encoder and decoder
- vp9_find_best_ref_mvs(xd,
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
mbmi->ref_mvs[frame_type],
&frame_nearest_mv[frame_type],
&frame_near_mv[frame_type]);
@@ -2295,7 +2307,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
// Further refinement that is encode side only to test the top few candidates
// in full and choose the best as the centre point for subsequent searches.
// The current implementation doesn't support scaling.
- if (!vp9_is_scaled(&scale[frame_type]))
+ if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8)
mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
frame_type, block_size);
}
@@ -2311,19 +2323,20 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
static INLINE int get_switchable_rate(const MACROBLOCK *x) {
const MACROBLOCKD *const xd = &x->e_mbd;
- const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
const int ctx = vp9_get_pred_context_switchable_interp(xd);
return SWITCHABLE_INTERP_RATE_FACTOR *
x->switchable_interp_costs[ctx][mbmi->interp_filter];
}
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
VP9_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
int bestsme = INT_MAX;
int further_steps, step_param;
@@ -2402,23 +2415,23 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
if (cpi->sf.search_method == HEX) {
- bestsme = vp9_hex_search(x, &mvp_full,
+ bestsme = vp9_hex_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
&cpi->fn_ptr[block_size], 1,
- &ref_mv, tmp_mv);
+ &ref_mv.as_mv, &tmp_mv->as_mv);
} else if (cpi->sf.search_method == SQUARE) {
- bestsme = vp9_square_search(x, &mvp_full,
+ bestsme = vp9_square_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
&cpi->fn_ptr[block_size], 1,
- &ref_mv, tmp_mv);
+ &ref_mv.as_mv, &tmp_mv->as_mv);
} else if (cpi->sf.search_method == BIGDIA) {
- bestsme = vp9_bigdia_search(x, &mvp_full,
+ bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
&cpi->fn_ptr[block_size], 1,
- &ref_mv, tmp_mv);
+ &ref_mv.as_mv, &tmp_mv->as_mv);
} else {
bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
sadpb, further_steps, 1,
@@ -2434,16 +2447,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
- cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
+ cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+ cm->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &sse);
}
- *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv,
- x->nmvjointcost, x->mvcost,
- 96);
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
x->pred_mv[ref].as_int = tmp_mv->as_int;
@@ -2463,7 +2476,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int *rate_mv) {
int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv ref_mv[2];
@@ -2499,12 +2512,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
backup_second_yv12[i] = xd->plane[i].pre[1];
- setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL);
+ setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
}
- xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+ xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
mi_row, mi_col);
- xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+ xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
mi_row, mi_col);
scaled_first_yv12 = xd->plane[0].pre[0];
@@ -2569,8 +2582,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int sse;
bestsme = cpi->find_fractional_mv_step_comp(
- x, &tmp_mv,
- &ref_mv[id],
+ x, &tmp_mv.as_mv,
+ &ref_mv[id].as_mv,
+ cpi->common.allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
0, cpi->sf.subpel_iters_per_step,
@@ -2602,17 +2616,18 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[1] = backup_second_yv12[i];
}
- *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]],
- &mbmi->ref_mvs[refs[0]][0],
- x->nmvjointcost, x->mvcost, 96);
- *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
- &mbmi->ref_mvs[refs[1]][0],
- x->nmvjointcost, x->mvcost, 96);
+ *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
vpx_free(second_pred);
}
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
int *rate2, int64_t *distortion,
@@ -2620,7 +2635,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate_y, int64_t *distortion_y,
int *rate_uv, int64_t *distortion_uv,
int *mode_excluded, int *disable_skip,
- INTERPOLATIONFILTERTYPE *best_filter,
+ INTERPOLATION_TYPE *best_filter,
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
@@ -2628,8 +2643,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
- const int is_comp_pred = (mbmi->ref_frame[1] > 0);
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
const int num_refs = is_comp_pred ? 2 : 1;
const int this_mode = mbmi->mode;
int_mv *frame_mv = mode_mv[this_mode];
@@ -2647,6 +2662,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
if (this_mode == NEWMV) {
int rate_mv;
if (is_comp_pred) {
@@ -2658,23 +2679,21 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
joint_motion_search(cpi, x, bsize, frame_mv,
mi_row, mi_col, single_newmv, &rate_mv);
} else {
- rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]],
- &mbmi->ref_mvs[refs[0]][0],
- x->nmvjointcost, x->mvcost, 96);
- rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
- &mbmi->ref_mvs[refs[1]][0],
- x->nmvjointcost, x->mvcost, 96);
+ rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- if (frame_mv[refs[0]].as_int == INVALID_MV ||
- frame_mv[refs[1]].as_int == INVALID_MV)
- return INT64_MAX;
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
- single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+ single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+ &tmp_mv, &rate_mv);
*rate2 += rate_mv;
frame_mv[refs[0]].as_int =
- xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
}
}
@@ -2904,7 +2923,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int thresh_ac;
// The encode_breakout input
unsigned int encode_breakout = x->encode_breakout << 4;
- int max_thresh = 36000;
+ unsigned int max_thresh = 36000;
// Use extreme low threshold for static frames to limit skipping.
if (cpi->enable_encode_breakout == 2)
@@ -3001,7 +3020,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
- super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+ super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
bsize, ref_best_rd - rdcosty);
if (*rate_uv == INT_MAX) {
*rate2 = INT_MAX;
@@ -3038,7 +3057,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
x->skip_encode = 0;
ctx->skip = 0;
- xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
&dist_y, &y_skip, bsize, tx_cache,
@@ -3070,14 +3089,19 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
*returndist = dist_y + dist_uv;
if (cpi->sf.tx_size_search_method == USE_FULL_RD)
- for (i = 0; i < TX_MODES; i++)
- ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
+ for (i = 0; i < TX_MODES; i++) {
+ if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
+ ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
+ else
+ ctx->tx_rd_diff[i] = 0;
+ }
}
- ctx->mic = *xd->this_mi;
+ ctx->mic = *xd->mi_8x8[0];
}
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
@@ -3086,10 +3110,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd_so_far) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
const struct segmentation *seg = &cm->seg;
const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
- RD_PREDICTION_MODE this_mode;
+ MB_PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
int comp_pred, i;
@@ -3103,13 +3127,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->gld_fb_idx,
cpi->alt_fb_idx};
int64_t best_rd = best_rd_so_far;
- int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
int64_t best_tx_rd[TX_MODES];
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
int j;
int mode_index, best_mode_index = 0;
@@ -3118,9 +3141,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_intra_rd = INT64_MAX;
int64_t best_inter_rd = INT64_MAX;
MB_PREDICTION_MODE best_intra_mode = DC_PRED;
- // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
- INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
+ INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
@@ -3130,22 +3152,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int mode_mask = 0;
int64_t mode_distortions[MB_MODE_COUNT] = {-1};
int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
- int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
- cpi->common.y_dc_delta_q);
- int_mv seg_mvs[4][MAX_REF_FRAMES];
- union b_mode_info best_bmodes[4];
- PARTITION_INFO best_partition;
+ int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
- for (i = 0; i < 4; i++) {
- int j;
- for (j = 0; j < MAX_REF_FRAMES; j++)
- seg_mvs[i][j].as_int = INVALID_MV;
- }
// Everywhere the flag is set the error is much higher than its neighbors.
ctx->frames_with_high_error = 0;
ctx->modes_with_high_error = 0;
@@ -3157,7 +3170,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -3199,8 +3212,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
- mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+ setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
+ block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
yv12_mb, scale_factor);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3251,7 +3265,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
assert(!"Invalid Reference frame");
}
}
- if (cpi->mode_skip_mask & (1 << mode_index))
+ if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
continue;
}
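
The widening cast above matters: the mode skip mask is indexed by mode_index values that can reach past bit 30, and shifting a plain int 1 by 31 or more is undefined behavior in C. A hedged sketch of the fixed test (illustrative names, not the encoder's actual declarations):

    #include <stdint.h>

    static int is_mode_skipped(int64_t mode_skip_mask, int mode_index) {
      /* (1 << mode_index) is undefined for mode_index >= 31 with a
       * 32-bit int; widening the constant first keeps all 64 mask
       * bits usable. */
      return (mode_skip_mask & ((int64_t)1 << mode_index)) != 0;
    }
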
@@ -3261,9 +3275,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
continue;
// Test best rd so far against threshold for trying this mode.
- if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
- cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) ||
- cpi->rd_threshes[bsize][mode_index] == INT_MAX)
+ if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] *
+ cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) ||
+ cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX)
continue;
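
Two things changed in that test: the threshold table gained a segment_id dimension, and the product is now computed in 64 bits. rd_thresh_freq_fact appears to act as a Q5 multiplier (32 meaning 1.0), hence the shift by 5. A sketch under those assumptions, with illustrative names:

    #include <limits.h>
    #include <stdint.h>

    static int skip_mode_early(int64_t best_rd, int thresh, int freq_fact) {
      if (thresh == INT_MAX)
        return 1;  /* mode disabled outright */
      /* Cast before multiplying: thresh * freq_fact can overflow int. */
      return best_rd < (((int64_t)thresh * freq_fact) >> 5);
    }
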
// Do not allow compound prediction if the segment level reference
@@ -3313,25 +3327,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
second_ref_frame != best_inter_ref_frame)
continue;
}
- // TODO(jingning, jkoleszar): scaling reference frame not supported for
- // SPLITMV.
- if (ref_frame > 0 &&
- vp9_is_scaled(&scale_factor[ref_frame]) &&
- this_mode == RD_SPLITMV)
- continue;
-
- if (second_ref_frame > 0 &&
- vp9_is_scaled(&scale_factor[second_ref_frame]) &&
- this_mode == RD_SPLITMV)
- continue;
-
- if (bsize >= BLOCK_8X8 &&
- (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
- continue;
-
- if (bsize < BLOCK_8X8 &&
- !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
- continue;
set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
mbmi->uv_mode = DC_PRED;
@@ -3339,7 +3334,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Evaluate all sub-pel filters irrespective of whether we can use
// them for this frame.
mbmi->interp_filter = cm->mcomp_filter_type;
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
if (comp_pred) {
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -3373,7 +3368,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// If the segment skip feature is enabled....
// then do nothing if the current mode is not allowed..
} else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
- (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
+ (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
@@ -3385,11 +3380,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// an unfiltered alternative. We allow near/nearest as well
// because they may result in zero-zero MVs but be cheaper.
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if ((this_mode != RD_ZEROMV &&
- !(this_mode == RD_NEARMV &&
- frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
- !(this_mode == RD_NEARESTMV &&
- frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+ if ((this_mode != ZEROMV &&
+ !(this_mode == NEARMV &&
+ frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
+ !(this_mode == NEARESTMV &&
+ frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
ref_frame != ALTREF_FRAME) {
continue;
}
@@ -3401,7 +3396,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// a representative block in the boundary ( first ) and then implement a
// function that does sads when inside the border..
if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
- this_mode == RD_NEWMV) {
+ this_mode == NEWMV) {
continue;
}
@@ -3411,58 +3406,27 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->mode_test_hits[bsize]++;
#endif
- if (this_mode == RD_I4X4_PRED) {
- int rate;
-
- /*
- if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
- (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
- continue;
- */
- // RD_I4X4_PRED is only considered for block sizes less than 8x8.
- mbmi->tx_size = TX_4X4;
- if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
- &distortion_y, best_rd) >= best_rd)
- continue;
- rate2 += rate;
- rate2 += intra_cost_penalty;
- distortion2 += distortion_y;
-
- if (rate_uv_intra[TX_4X4] == INT_MAX) {
- choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
- &rate_uv_tokenonly[TX_4X4],
- &dist_uv[TX_4X4], &skip_uv[TX_4X4],
- &mode_uv[TX_4X4]);
- }
- rate2 += rate_uv_intra[TX_4X4];
- rate_uv = rate_uv_tokenonly[TX_4X4];
- distortion2 += dist_uv[TX_4X4];
- distortion_uv = dist_uv[TX_4X4];
- mbmi->uv_mode = mode_uv[TX_4X4];
- tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
- for (i = 0; i < TX_MODES; ++i)
- tx_cache[i] = tx_cache[ONLY_4X4];
- } else if (ref_frame == INTRA_FRAME) {
+ if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
// Disable intra modes other than DC_PRED for blocks with low variance
// Threshold for intra skipping based on source variance
// TODO(debargha): Specialize the threshold for super block sizes
- static const int skip_intra_var_thresh[BLOCK_SIZES] = {
+ static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
- this_mode != RD_DC_PRED &&
+ this_mode != DC_PRED &&
x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
continue;
// Only search the oblique modes if the best so far is
// one of the neighboring directional modes
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
- (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
continue;
}
- mbmi->mode = rd_mode_to_mode(this_mode);
+ mbmi->mode = this_mode;
if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
if (conditional_skipintra(mbmi->mode, best_intra_mode))
continue;
@@ -3488,11 +3452,631 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->uv_mode = mode_uv[uv_tx];
rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
- if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
+ if (this_mode != DC_PRED && this_mode != TM_PRED)
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
- } else if (this_mode == RD_SPLITMV) {
- const int is_comp_pred = second_ref_frame > 0;
+ } else {
+ mbmi->mode = this_mode;
+ compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
+ this_rd = handle_inter_mode(cpi, x, tile, bsize,
+ tx_cache,
+ &rate2, &distortion2, &skippable,
+ &rate_y, &distortion_y,
+ &rate_uv, &distortion_uv,
+ &mode_excluded, &disable_skip,
+ &tmp_best_filter, frame_mv,
+ mi_row, mi_col,
+ single_newmv, &total_sse, best_rd);
+ if (this_rd == INT64_MAX)
+ continue;
+ }
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ rate2 += compmode_cost;
+ }
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (second_ref_frame > INTRA_FRAME) {
+ rate2 += ref_costs_comp[ref_frame];
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+ if (!disable_skip) {
+ // Test for the condition where skip block will be activated
+ // because there are no non zero coefficients and make any
+ // necessary adjustment for rate. Ignore if skip is coded at
+ // segment level as the cost won't have been added in.
+ // Is MB level skip allowed (i.e. not coded at segment level).
+ const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
+ SEG_LVL_SKIP);
+
+ if (skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ // for best yrd calculation
+ rate_uv = 0;
+
+ if (mb_skip_allowed) {
+ int prob_skip_cost;
+
+ // Cost the skip mb case
+ vp9_prob skip_prob =
+ vp9_get_pred_prob_mbskip(cm, xd);
+
+ if (skip_prob) {
+ prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+ rate2 += prob_skip_cost;
+ }
+ }
+ } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 0);
+ rate2 += prob_skip_cost;
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 1);
+ rate2 += prob_skip_cost;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
+ } else if (mb_skip_allowed) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 0);
+ rate2 += prob_skip_cost;
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
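
In effect the skip accounting above keeps the coefficient bits only when they pay for themselves. Ignoring the cost of the skip flag itself, the block is coded as skipped roughly when

    RDCOST(rdmult, rddiv, rate_y + rate_uv, distortion2)
        >= RDCOST(rdmult, rddiv, 0, total_sse)

that is, when the bits spent on coefficients no longer buy enough distortion reduction over falling back to the raw prediction error (total_sse) alone.
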
+
+ // Keep record of best intra rd
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ this_rd < best_intra_rd) {
+ best_intra_rd = this_rd;
+ best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
+ }
+
+ // Keep record of best inter rd with single reference
+ if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
+ !mode_excluded && this_rd < best_inter_rd) {
+ best_inter_rd = this_rd;
+ best_inter_ref_frame = ref_frame;
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
+ }
+
+ // Store the respective mode distortions for later use.
+ if (mode_distortions[this_mode] == -1
+ || distortion2 < mode_distortions[this_mode]) {
+ mode_distortions[this_mode] = distortion2;
+ }
+ if (frame_distortions[ref_frame] == -1
+ || distortion2 < frame_distortions[ref_frame]) {
+ frame_distortions[ref_frame] = distortion2;
+ }
+
+ // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ }
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
+ vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ // TODO(debargha): enhance this test with a better distortion prediction
+ // based on qp, activity mask and history
+ if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+ (mode_index > MIN_EARLY_TERM_INDEX)) {
+ const int qstep = xd->plane[0].dequant[1];
+ // TODO(debargha): Enhance this by specializing for each mode_index
+ int scale = 4;
+ if (x->source_variance < UINT_MAX) {
+ const int var_adjust = (x->source_variance < 16);
+ scale -= var_adjust;
+ }
+ if (ref_frame > INTRA_FRAME &&
+ distortion2 * scale < qstep * qstep) {
+ early_term = 1;
+ }
+ }
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (second_ref_frame <= INTRA_FRAME &&
+ single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+ best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ } else if (second_ref_frame > INTRA_FRAME &&
+ single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+ best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ }
+ if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+ best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ }
+
+ /* keep record of best filter type */
+ if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
+ cm->mcomp_filter_type != BILINEAR) {
+ int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
+ SWITCHABLE_FILTERS : cm->mcomp_filter_type];
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int64_t adj_rd;
+ // In cases of poor prediction, filter_cache[] can contain really big
+ // values, which actually are bigger than this_rd itself. This can
+ // cause negative best_filter_rd[] values, which is obviously silly.
+ // Therefore, if filter_cache < ref, we do an adjusted calculation.
+ if (cpi->rd_filter_cache[i] >= ref) {
+ adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
+ } else {
+ // FIXME(rbultje) do this for compound prediction also
+ //
+ // To prevent out-of-range computation in
+ // adj_rd = cpi->rd_filter_cache[i] * this_rd / ref
+ // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio.
+ int tmp = cpi->rd_filter_cache[i] * 256 / ref;
+ adj_rd = (this_rd * tmp) >> 8;
+ }
+ best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+ }
+ }
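
The 256-based trick in the comment is worth spelling out: computing filter_cache[i] * this_rd / ref directly can overflow 64 bits when both RD values are large, so the ratio is first reduced to 8 fractional bits (at most 256, since filter_cache < ref in that branch). A sketch with illustrative names:

    #include <stdint.h>

    static int64_t scaled_filter_rd(int64_t this_rd, int64_t cache,
                                    int64_t ref) {
      if (cache >= ref)
        return this_rd + cache - ref;           /* additive adjustment */
      {
        const int64_t tmp = cache * 256 / ref;  /* ratio in Q8, <= 256 */
        return (this_rd * tmp) >> 8;            /* scale this_rd by it */
      }
    }
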
+
+ /* keep record of best txfm size */
+ if (bsize < BLOCK_32X32) {
+ if (bsize < BLOCK_16X16)
+ tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
+
+ tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
+ }
+ if (!mode_excluded && this_rd != INT64_MAX) {
+ for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
+ int64_t adj_rd = INT64_MAX;
+ adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
+
+ if (adj_rd < best_tx_rd[i])
+ best_tx_rd[i] = adj_rd;
+ }
+ }
+
+ if (early_term)
+ break;
+
+ if (x->skip && !comp_pred)
+ break;
+ }
+
+ if (best_rd >= best_rd_so_far)
+ return INT64_MAX;
+
+ // If we used an estimate for the uv intra rd in the loop above...
+ if (cpi->sf.use_uv_intra_rd_estimate) {
+ // Do Intra UV best rd mode selection if best mode choice above was intra.
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
+ TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ &rate_uv_tokenonly[uv_tx_size],
+ &dist_uv[uv_tx_size],
+ &skip_uv[uv_tx_size],
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ }
+ }
+
+ // If we are using reference masking and the set mask flag is set then
+ // create the reference frame mask.
+ if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
+ cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
+
+ // Flag all modes that have a distortion that's > 2x the best we found at
+ // this level.
+ for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
+ if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
+ continue;
+
+ if (mode_distortions[mode_index] > 2 * *returndistortion) {
+ ctx->modes_with_high_error |= (1 << mode_index);
+ }
+ }
+
+ // Flag all ref frames that have a distortion that's > 2x the best we found at
+ // this level.
+ for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (frame_distortions[ref_frame] > 2 * *returndistortion) {
+ ctx->frames_with_high_error |= (1 << ref_frame);
+ }
+ }
+
+ assert((cm->mcomp_filter_type == SWITCHABLE) ||
+ (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
+ (best_mbmode.ref_frame[0] == INTRA_FRAME));
+
+ // Updating rd_thresh_freq_fact[] here means that the different
+ // partition/block sizes are handled independently based on the best
+ // choice for the current partition. It may well be better to keep a scaled
+ // best rd so far value and update rd_thresh_freq_fact based on the mode/size
+ // combination that wins out.
+ if (cpi->sf.adaptive_rd_thresh) {
+ for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+ if (mode_index == best_mode_index) {
+ cpi->rd_thresh_freq_fact[bsize][mode_index] -=
+ (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
+ } else {
+ cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
+ if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
+ (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
+ cpi->rd_thresh_freq_fact[bsize][mode_index] =
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+ }
+ }
+ }
+ }
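
The adaptation above is a simple pursuit scheme: the winning mode's factor decays geometrically while every other mode's factor grows linearly up to a cap, roughly

    f[best]  <- f[best] - (f[best] >> 3)                        (decay by 1/8)
    f[other] <- MIN(f[other] + RD_THRESH_INC,
                    adaptive_rd_thresh * RD_THRESH_MAX_FACT)    (bounded growth)

so modes that keep losing become progressively harder to enter, and the recent winner becomes progressively easier.
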
+
+ // macroblock modes
+ *mbmi = best_mbmode;
+ x->skip |= best_skip2;
+
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ if (best_filter_rd[i] == INT64_MAX)
+ best_filter_diff[i] = 0;
+ else
+ best_filter_diff[i] = best_rd - best_filter_rd[i];
+ }
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+ } else {
+ vp9_zero(best_filter_diff);
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < TX_MODES; i++) {
+ if (best_tx_rd[i] == INT64_MAX)
+ best_tx_diff[i] = 0;
+ else
+ best_tx_diff[i] = best_rd - best_tx_rd[i];
+ }
+ } else {
+ vp9_zero(best_tx_diff);
+ }
+
+ set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
+ scale_factor);
+ store_coding_context(x, ctx, best_mode_index,
+ &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+ &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
+ mbmi->ref_frame[1]][0],
+ best_pred_diff, best_tx_diff, best_filter_diff);
+
+ return best_rd;
+}
+
+
+int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ const struct segmentation *seg = &cm->seg;
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mbmi->segment_id;
+ int comp_pred, i;
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int idx_list[4] = {0,
+ cpi->lst_fb_idx,
+ cpi->gld_fb_idx,
+ cpi->alt_fb_idx};
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
+ int64_t best_tx_rd[TX_MODES];
+ int64_t best_tx_diff[TX_MODES];
+ int64_t best_pred_diff[NB_PREDICTION_TYPES];
+ int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ MB_MODE_INFO best_mbmode = { 0 };
+ int mode_index, best_mode_index = 0;
+ unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ vp9_prob comp_mode_p;
+ int64_t best_inter_rd = INT64_MAX;
+ MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
+ INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
+ int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+ int64_t dist_uv[TX_SIZES];
+ int skip_uv[TX_SIZES];
+ MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
+ struct scale_factors scale_factor[4];
+ unsigned int ref_frame_mask = 0;
+ unsigned int mode_mask = 0;
+ int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+ cpi->common.y_dc_delta_q);
+ int_mv seg_mvs[4][MAX_REF_FRAMES];
+ b_mode_info best_bmodes[4];
+ int best_skip2 = 0;
+
+ x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+ vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < MAX_REF_FRAMES; j++)
+ seg_mvs[i][j].as_int = INVALID_MV;
+ }
+
+ estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_MODES; i++)
+ best_tx_rd[i] = INT64_MAX;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_SIZES; i++)
+ rate_uv_intra[i] = INT_MAX;
+
+ *returnrate = INT_MAX;
+
+ // Create a mask set to 1 for each reference frame used by a smaller
+ // resolution.
+ if (cpi->sf.use_avoid_tested_higherror) {
+ ref_frame_mask = 0;
+ mode_mask = 0;
+ ref_frame_mask = ~ref_frame_mask;
+ mode_mask = ~mode_mask;
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
+ block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
+ yv12_mb, scale_factor);
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
+ for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int64_t tx_cache[TX_MODES];
+ int i;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
+ int early_term = 0;
+
+ for (i = 0; i < TX_MODES; ++i)
+ tx_cache[i] = INT64_MAX;
+
+ x->skip = 0;
+ ref_frame = vp9_ref_order[mode_index].ref_frame;
+ second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+ if (mode_index == 3) {
+ switch (vp9_ref_order[best_mode_index].ref_frame) {
+ case INTRA_FRAME:
+ cpi->mode_skip_mask = 0;
+ break;
+ case LAST_FRAME:
+ cpi->mode_skip_mask = 0x0010;
+ break;
+ case GOLDEN_FRAME:
+ cpi->mode_skip_mask = 0x0008;
+ break;
+ case ALTREF_FRAME:
+ cpi->mode_skip_mask = 0x0000;
+ break;
+ case NONE:
+ case MAX_REF_FRAMES:
+ assert(!"Invalid Reference frame");
+ }
+ }
+ if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+ continue;
+ }
+
+ // Skip if the current reference frame has been masked off
+ if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
+ (cpi->ref_frame_mask & (1 << ref_frame)))
+ continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if ((best_rd <
+ ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
+ cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
+ continue;
+
+ // Do not allow compound prediction if the segment level reference
+ // frame feature is in use as in this case there can only be one reference.
+ if ((second_ref_frame > INTRA_FRAME) &&
+ vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+ continue;
+
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+
+ if (!(ref_frame == INTRA_FRAME
+ || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+ continue;
+ }
+ if (!(second_ref_frame == NONE
+ || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
+ continue;
+ }
+
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+ if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME)
+ continue;
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+ if (ref_frame != best_inter_ref_frame &&
+ second_ref_frame != best_inter_ref_frame)
+ continue;
+ }
+
+ // TODO(jingning, jkoleszar): scaling reference frame not supported for
+ // sub8x8 blocks.
+ if (ref_frame > 0 &&
+ vp9_is_scaled(scale_factor[ref_frame].sfc))
+ continue;
+
+ if (second_ref_frame > 0 &&
+ vp9_is_scaled(scale_factor[second_ref_frame].sfc))
+ continue;
+
+ set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+ mbmi->uv_mode = DC_PRED;
+
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+ mbmi->interp_filter = cm->mcomp_filter_type;
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+ if (comp_pred) {
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+ continue;
+ set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+
+ mode_excluded = mode_excluded
+ ? mode_excluded
+ : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ } else {
+ if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
+ mode_excluded =
+ mode_excluded ?
+ mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ }
+ }
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred)
+ xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
+ (int)ref_frame) {
+ continue;
+ // If the segment skip feature is enabled....
+ // then do nothing if the current mode is not allowed..
+ } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
+ ref_frame != INTRA_FRAME) {
+ continue;
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ } else if (!vp9_segfeature_active(seg, segment_id,
+ SEG_LVL_REF_FRAME)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ continue;
+ }
+
+#ifdef MODE_TEST_HIT_STATS
+ // TEST/DEBUG CODE
+ // Keep a record of the number of test hits at each size
+ cpi->mode_test_hits[bsize]++;
+#endif
+
+ if (ref_frame == INTRA_FRAME) {
+ int rate;
+ mbmi->tx_size = TX_4X4;
+ if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
+ &distortion_y, best_rd) >= best_rd)
+ continue;
+ rate2 += rate;
+ rate2 += intra_cost_penalty;
+ distortion2 += distortion_y;
+
+ if (rate_uv_intra[TX_4X4] == INT_MAX) {
+ choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+ &rate_uv_tokenonly[TX_4X4],
+ &dist_uv[TX_4X4], &skip_uv[TX_4X4],
+ &mode_uv[TX_4X4]);
+ }
+ rate2 += rate_uv_intra[TX_4X4];
+ rate_uv = rate_uv_tokenonly[TX_4X4];
+ distortion2 += dist_uv[TX_4X4];
+ distortion_uv = dist_uv[TX_4X4];
+ mbmi->uv_mode = mode_uv[TX_4X4];
+ tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ for (i = 0; i < TX_MODES; ++i)
+ tx_cache[i] = tx_cache[ONLY_4X4];
+ } else {
int rate;
int64_t distortion;
int64_t this_rd_thresh;
@@ -3501,30 +4085,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
int tmp_best_skippable = 0;
int switchable_filter_index;
- int_mv *second_ref = is_comp_pred ?
- &mbmi->ref_mvs[second_ref_frame][0] : NULL;
- union b_mode_info tmp_best_bmodes[16];
+ int_mv *second_ref = comp_pred ?
+ &mbmi->ref_mvs[second_ref_frame][0] : NULL;
+ b_mode_info tmp_best_bmodes[16];
MB_MODE_INFO tmp_best_mbmode;
- PARTITION_INFO tmp_best_partition;
BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
int pred_exists = 0;
int uv_skippable;
- if (is_comp_pred) {
- if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
- if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
- continue;
- if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
- if (ref_frame != best_inter_ref_frame &&
- second_ref_frame != best_inter_ref_frame)
- continue;
- }
this_rd_thresh = (ref_frame == LAST_FRAME) ?
- cpi->rd_threshes[bsize][THR_NEWMV] :
- cpi->rd_threshes[bsize][THR_NEWA];
+ cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
+ cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
- cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
- xd->this_mi->mbmi.tx_size = TX_4X4;
+ cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+ xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
if (cm->mcomp_filter_type != BILINEAR) {
@@ -3542,7 +4116,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->interp_filter = switchable_filter_index;
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
&mbmi->ref_mvs[ref_frame][0],
second_ref,
best_yrd,
@@ -3578,9 +4152,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
tmp_best_sse = total_sse;
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
- tmp_best_partition = *x->partition_info;
- for (i = 0; i < 4; i++)
- tmp_best_bmodes[i] = xd->this_mi->bmi[i];
+ for (i = 0; i < 4; i++) {
+ tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+ x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+ }
pred_exists = 1;
if (switchable_filter_index == 0 &&
cpi->sf.use_rd_breakout &&
@@ -3607,7 +4182,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!pred_exists) {
// Handles the special case when a filter that is not in the
// switchable list (bilinear, 6-tap) is indicated at the frame level
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
&mbmi->ref_mvs[ref_frame][0],
second_ref,
best_yrd,
@@ -3630,9 +4205,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
distortion = tmp_best_distortion;
skippable = tmp_best_skippable;
*mbmi = tmp_best_mbmode;
- *x->partition_info = tmp_best_partition;
for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i] = tmp_best_bmodes[i];
+ xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i];
}
rate2 += rate;
@@ -3642,12 +4216,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
rate2 += get_switchable_rate(x);
if (!mode_excluded) {
- if (is_comp_pred)
+ if (comp_pred)
mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
else
mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
}
- compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
+ compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
tmp_best_rdu = best_rd -
MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
@@ -3658,7 +4232,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// then don't bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
BLOCK_8X8);
- super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
+ super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
&uv_sse, BLOCK_8X8, tmp_best_rdu);
if (rate_uv == INT_MAX)
continue;
@@ -3671,20 +4245,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < TX_MODES; ++i)
tx_cache[i] = tx_cache[ONLY_4X4];
}
- } else {
- mbmi->mode = rd_mode_to_mode(this_mode);
- compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
- this_rd = handle_inter_mode(cpi, x, bsize,
- tx_cache,
- &rate2, &distortion2, &skippable,
- &rate_y, &distortion_y,
- &rate_uv, &distortion_uv,
- &mode_excluded, &disable_skip,
- &tmp_best_filter, frame_mv,
- mi_row, mi_col,
- single_newmv, &total_sse, best_rd);
- if (this_rd == INT64_MAX)
- continue;
}
if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
@@ -3708,25 +4268,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
SEG_LVL_SKIP);
- if (skippable && bsize >= BLOCK_8X8) {
- // Back out the coefficient coding costs
- rate2 -= (rate_y + rate_uv);
- // for best yrd calculation
- rate_uv = 0;
-
- if (mb_skip_allowed) {
- int prob_skip_cost;
-
- // Cost the skip mb case
- vp9_prob skip_prob =
- vp9_get_pred_prob_mbskip(cm, xd);
-
- if (skip_prob) {
- prob_skip_cost = vp9_cost_bit(skip_prob, 1);
- rate2 += prob_skip_cost;
- }
- }
- } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+ if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
// Add in the cost of the no skip flag.
@@ -3756,42 +4298,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
- // Keep record of best intra rd
- if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
- is_intra_mode(xd->this_mi->mbmi.mode) &&
- this_rd < best_intra_rd) {
- best_intra_rd = this_rd;
- best_intra_mode = xd->this_mi->mbmi.mode;
- }
// Keep record of best inter rd with single reference
- if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->this_mi->mbmi.ref_frame[1] == NONE &&
+ if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
+ xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
!mode_excluded &&
this_rd < best_inter_rd) {
best_inter_rd = this_rd;
best_inter_ref_frame = ref_frame;
- // best_inter_mode = xd->this_mi->mbmi.mode;
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
- if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
- // Store the respective mode distortions for later use.
- if (mode_distortions[this_mode] == -1
- || distortion2 < mode_distortions[this_mode]) {
- mode_distortions[this_mode] = distortion2;
- }
- if (frame_distortions[ref_frame] == -1
- || distortion2 < frame_distortions[ref_frame]) {
- frame_distortions[ref_frame] = distortion2;
- }
- }
-
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
@@ -3810,15 +4332,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
- best_partition = *x->partition_info;
+ vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
- if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
- for (i = 0; i < 4; i++)
- best_bmodes[i] = xd->this_mi->bmi[i];
+ for (i = 0; i < 4; i++)
+ best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
- if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) {
+ if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+ (mode_index > MIN_EARLY_TERM_INDEX)) {
const int qstep = xd->plane[0].dequant[1];
// TODO(debargha): Enhance this by specializing for each mode_index
int scale = 4;
@@ -3865,7 +4388,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->mcomp_filter_type];
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -3882,8 +4405,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best txfm size */
if (bsize < BLOCK_32X32) {
if (bsize < BLOCK_16X16) {
- if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
- tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
+ tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
}
tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
@@ -3891,11 +4413,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!mode_excluded && this_rd != INT64_MAX) {
for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
int64_t adj_rd = INT64_MAX;
- if (this_mode != RD_I4X4_PRED) {
+ if (ref_frame > INTRA_FRAME)
adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
- } else {
+ else
adj_rd = this_rd;
- }
if (adj_rd < best_tx_rd[i])
best_tx_rd[i] = adj_rd;
@@ -3915,39 +4436,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// If we used an estimate for the uv intra rd in the loop above...
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
- if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
+ if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
- bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ BLOCK_8X8);
}
}
// If we are using reference masking and the set mask flag is set then
// create the reference frame mask.
if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
- cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
-
- // Flag all modes that have a distortion thats > 2x the best we found at
- // this level.
- for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
- if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
- continue;
-
- if (mode_distortions[mode_index] > 2 * *returndistortion) {
- ctx->modes_with_high_error |= (1 << mode_index);
- }
- }
-
- // Flag all ref frames that have a distortion thats > 2x the best we found at
- // this level.
- for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
- if (frame_distortions[ref_frame] > 2 * *returndistortion) {
- ctx->frames_with_high_error |= (1 << ref_frame);
- }
- }
+ cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame);
if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
*returnrate = INT_MAX;
@@ -3965,16 +4467,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// best rd so far value and update rd_thresh_freq_fact based on the mode/size
// combination that wins out.
if (cpi->sf.adaptive_rd_thresh) {
- for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+ for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
if (mode_index == best_mode_index) {
- cpi->rd_thresh_freq_fact[bsize][mode_index] -=
- (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -=
+ (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3);
} else {
- cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
- if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
- cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
+ if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
+ (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
}
}
}
@@ -3983,27 +4485,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
- if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
- best_mbmode.sb_type < BLOCK_8X8) {
+ if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
- }
-
- if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
- best_mbmode.sb_type < BLOCK_8X8) {
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[0].as_int =
- best_bmodes[i].as_mv[0].as_int;
-
- if (mbmi->ref_frame[1] > 0)
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[1].as_int =
- best_bmodes[i].as_mv[1].as_int;
-
- *x->partition_info = best_partition;
+ xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+ } else {
+ for (i = 0; i < 4; ++i)
+ vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
- mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
- mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
+ mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
}
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
@@ -4014,7 +4504,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!x->skip) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
@@ -4023,7 +4513,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cm->mcomp_filter_type == SWITCHABLE)
assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
} else {
- vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
+ vp9_zero(best_filter_diff);
}
if (!x->skip) {
@@ -4034,13 +4524,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_tx_diff[i] = best_rd - best_tx_rd[i];
}
} else {
- vpx_memset(best_tx_diff, 0, sizeof(best_tx_diff));
+ vp9_zero(best_tx_diff);
}
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
- &best_partition,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h
index eba7df9..92fb235 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/libvpx/vp9/encoder/vp9_rdopt.h
@@ -12,10 +12,17 @@
#ifndef VP9_ENCODER_VP9_RDOPT_H_
#define VP9_ENCODER_VP9_RDOPT_H_
-#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
+#define RDDIV_BITS 7
+
+#define RDCOST(RM, DM, R, D) \
+ (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
#define QIDX_SKIP_THRESH 115
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
+struct TileInfo;
+
+int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
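
One reading of the new RDCOST macro (hedged, since RDDIV_BITS is defined here but the shift still arrives through the DM argument): it is the usual Lagrangian cost

    RDCOST(RM, DM, R, D) = ((R * RM + 128) >> 8) + (D << DM)
                         ~ lambda * R + D * 2^DM,  with lambda = RM / 256

so the rate term is weighted by rdmult in Q8 with rounding, and the distortion term is now pre-scaled by a power-of-two shift instead of the old DM * D multiply.
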
@@ -24,13 +31,31 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
int mi_row, int mi_col,
- int *r, int64_t *d, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
+ int mi_row, int mi_col,
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
void vp9_init_me_luts();
void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
MB_PREDICTION_MODE mb, int_mv *mv);
+void vp9_get_entropy_contexts(TX_SIZE tx_size,
+ ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
+ const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
+ int num_4x4_w, int num_4x4_h);
+
#endif // VP9_ENCODER_VP9_RDOPT_H_
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index 10655e8..24f011f 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -117,7 +117,8 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
return cost;
}
-static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -129,9 +130,10 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- segment_id = mi_8x8[0]->mbmi.segment_id;
+ xd->mi_8x8 = mi_8x8;
+ segment_id = xd->mi_8x8[0]->mbmi.segment_id;
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
// Count the number of hits on each segment with no prediction
no_pred_segcounts[segment_id]++;
@@ -156,7 +158,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
}
-static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -174,19 +177,20 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
if (bw == bs && bh == bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, bs, mi_row, mi_col);
} else if (bw == bs && bh < bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
- count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts,
+ count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts,
temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
mi_row + hbs, mi_col);
} else if (bw < bs && bh == bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
- count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+ count_segs(cpi, tile, mi_8x8 + hbs,
+ no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
+ hbs, bs, mi_row, mi_col + hbs);
} else {
const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
@@ -197,7 +201,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int mi_dc = hbs * (n & 1);
const int mi_dr = hbs * (n >> 1);
- count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc],
+ count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc],
no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts,
mi_row + mi_dr, mi_col + mi_dc, subsize);
@@ -233,15 +237,18 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
// First of all generate stats regarding how well the last segment map
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
- vp9_get_tile_col_offsets(cm, tile_col);
- mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start;
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
+ mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += 8, mi += 8)
- count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64);
+ count_segs_sb(cpi, &tile, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row, mi_col, BLOCK_64X64);
}
}
@@ -251,13 +258,13 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
// Key frames cannot use temporal prediction
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
// Work out probability tree for coding those segments not
// predicted using the temporal method and the cost.
calc_segtree_probs(t_unpred_seg_counts, t_pred_tree);
t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
- // Add in the cost of the signalling for each prediction context
+ // Add in the cost of the signaling for each prediction context.
for (i = 0; i < PREDICTION_PROBS; i++) {
const int count0 = temporal_predictor_count[i][0];
const int count1 = temporal_predictor_count[i][1];
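
The tile state that used to live in VP9_COMMON (cur_tile_mi_col_start/end) is now a local TileInfo threaded through count_segs()/count_segs_sb(). A condensed sketch of the resulting traversal, with the per-superblock body elided:

TileInfo tile;
int tile_col, mi_row, mi_col;
for (tile_col = 0; tile_col < (1 << cm->log2_tile_cols); ++tile_col) {
  vp9_tile_init(&tile, cm, 0, tile_col);  // tile row 0, this column
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8)
    for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8)
      ;  // count_segs_sb(cpi, &tile, ..., mi_row, mi_col, BLOCK_64X64)
}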
diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c
index c155516..a5f18e6 100644
--- a/libvpx/vp9/encoder/vp9_ssim.c
+++ b/libvpx/vp9/encoder/vp9_ssim.c
@@ -42,8 +42,8 @@ void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
}
}
-const static int64_t cc1 = 26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634;  // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2)
static double similarity(unsigned long sum_s, unsigned long sum_r,
unsigned long sum_sq_s, unsigned long sum_sq_r,
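
Checking the constants against the comments: 64^2 = 4096, (.01*255)^2 = 6.5025, and 4096 * 6.5025 = 26634.24, which truncates to cc1 = 26634; likewise 4096 * (.03*255)^2 = 4096 * 58.5225 = 239708.16, truncating to cc2 = 239708.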
diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c
index 667b801..387fc90 100644
--- a/libvpx/vp9/encoder/vp9_subexp.c
+++ b/libvpx/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,8 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
}
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- vp9_prob upd, unsigned int *ct) {
+ const unsigned int ct[2]) {
+ const vp9_prob upd = DIFF_UPDATE_PROB;
vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
upd);
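
With upd folded in as DIFF_UPDATE_PROB, the rest of the function decides between a differential update and keeping the old probability. A sketch of that decision, reconstructed from the signature and the vp9_write_prob_diff_update() declaration below (not verbatim from the patch):

if (savings > 0) {
  vp9_write(w, 1, upd);                        // signal "update follows"
  vp9_write_prob_diff_update(w, newp, *oldp);
  *oldp = newp;
} else {
  vp9_write(w, 0, upd);                        // keep the old probability
}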
diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h
index 7acdaf6..521c777 100644
--- a/libvpx/vp9/encoder/vp9_subexp.h
+++ b/libvpx/vp9/encoder/vp9_subexp.h
@@ -19,7 +19,7 @@ void vp9_write_prob_diff_update(vp9_writer *w,
vp9_prob newp, vp9_prob oldp);
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- vp9_prob upd, unsigned int *ct);
+ unsigned int *ct);
int vp9_prob_diff_update_savings_search(const unsigned int *ct,
vp9_prob oldp, vp9_prob *bestp,
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 63826ee..2cace03 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -29,7 +29,7 @@
#include "vpx_ports/vpx_timer.h"
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
uint8_t *y_mb_ptr,
@@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
int stride,
int mv_row,
int mv_col,
- uint8_t *pred) {
+ uint8_t *pred,
+ struct scale_factors *scale) {
const int which_mv = 0;
MV mv = { mv_row, mv_col };
vp9_build_inter_predictor(y_mb_ptr, stride,
&pred[0], 16,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
16, 16,
which_mv,
&xd->subpix, MV_PRECISION_Q3);
@@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
vp9_build_inter_predictor(u_mb_ptr, stride,
&pred[256], 8,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
8, 8,
which_mv,
&xd->subpix, MV_PRECISION_Q4);
@@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
vp9_build_inter_predictor(v_mb_ptr, stride,
&pred[320], 8,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
8, 8,
which_mv,
&xd->subpix, MV_PRECISION_Q4);
@@ -83,7 +84,6 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
for (i = 0, k = 0; i < block_size; i++) {
for (j = 0; j < block_size; j++, k++) {
-
int src_byte = frame1[byte];
int pixel_value = *frame2++;
@@ -151,13 +151,12 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
/*cpi->sf.search_method == HEX*/
- // TODO Check that the 16x16 vf & sdf are selected here
// Ignore mv costing by sending NULL pointer instead of cost arrays
ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0];
- bestsme = vp9_hex_search(x, &best_ref_mv1_full,
+ bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv,
step_param, sadpb, 1,
&cpi->fn_ptr[BLOCK_16X16],
- 0, &best_ref_mv1, ref_mv);
+ 0, &best_ref_mv1.as_mv, &ref_mv->as_mv);
#if ALT_REF_SUBPEL_ENABLED
// Try sub-pixel MC?
@@ -166,8 +165,9 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
int distortion;
unsigned int sse;
// Ignore mv costing by sending NULL pointer instead of cost array
- bestsme = cpi->find_fractional_mv_step(x, ref_mv,
- &best_ref_mv1,
+ bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv,
+ &best_ref_mv1.as_mv,
+ cpi->common.allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
0, cpi->sf.subpel_iters_per_step,
@@ -187,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
static void temporal_filter_iterate_c(VP9_COMP *cpi,
int frame_count,
int alt_ref_index,
- int strength) {
+ int strength,
+ struct scale_factors *scale) {
int byte;
int frame;
int mb_col, mb_row;
@@ -281,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
cpi->frames[frame]->y_stride,
mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
- predictor);
+ predictor, scale);
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
@@ -375,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
- (num_frames_backward + 1);
+ struct scale_factors scale;
+ struct scale_factors_common scale_comm;
+
switch (blur_type) {
case 1:
// Backward Blur
@@ -424,26 +428,22 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
#ifdef DEBUGFWG
// DEBUG FWG
- printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-, max_frames
-, num_frames_backward
-, num_frames_forward
-, frames_to_blur
-, frames_to_blur_backward
-, frames_to_blur_forward
-, cpi->source_encode_index
-, cpi->last_alt_ref_sei
-, start_frame);
+ printf(
+ "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d "
+ "start:%d",
+ max_frames, num_frames_backward, num_frames_forward, frames_to_blur,
+ frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index,
+ cpi->last_alt_ref_sei, start_frame);
#endif
// Set up scaling factors. Scaling on each of the arnr frames is not supported.
- vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
- cm->yv12_fb[cm->new_fb_idx].y_crop_width,
- cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+ vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
+ get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height,
cm->width, cm->height);
// Set up frame pointers; NULL indicates a frame not included in the filter.
- vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
+ vp9_zero(cpi->frames);
for (frame = 0; frame < frames_to_blur; frame++) {
int which_buffer = start_frame - frame;
struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
@@ -452,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
}
temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
- strength);
+ strength, &scale);
}
void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
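
The net effect of this plumbing: the ARNR path no longer borrows xd->scale_factor[0] but builds its own scale factors on the stack and hands them down the call chain. Condensed (names from the diff; src_crop_width/height stand in for the get_frame_new_buffer() crop dimensions):

struct scale_factors scale;
struct scale_factors_common scale_comm;
vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
                                  src_crop_width, src_crop_height,
                                  cm->width, cm->height);
temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
                          strength, &scale);
// ...which forwards &scale to temporal_filter_predictors_mb_c() in place
// of the old xd->scale_factor[which_mv].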
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index 0c9bf9d..7d4676e 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -21,24 +21,12 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_entropy.h"
-/* Global event counters used for accumulating statistics across several
- compressions, then generating vp9_context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
-#endif /* ENTROPY_STATS */
-
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int dct_value_cost[DCT_MAX_VALUE * 2];
const int *vp9_dct_value_cost_ptr;
static void fill_value_tokens() {
-
TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
const vp9_extra_bit *const e = vp9_extra_bits;
@@ -60,9 +48,9 @@ static void fill_value_tokens() {
t[i].token = --j;
eb |= (a - e[j].base_val) << 1;
- } else
+ } else {
t[i].token = a;
-
+ }
t[i].extra = eb;
}
@@ -81,9 +69,7 @@ static void fill_value_tokens() {
cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
dct_value_cost[i + DCT_MAX_VALUE] = cost;
}
-
}
-
} while (++i < DCT_MAX_VALUE);
vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
@@ -95,6 +81,7 @@ struct tokenize_b_args {
MACROBLOCKD *xd;
TOKENEXTRA **tp;
TX_SIZE tx_size;
+ uint8_t *token_cache;
};
static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -113,8 +100,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
+ uint8_t *token_cache = args->token_cache;
struct macroblockd_plane *pd = &xd->plane[plane];
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int pt; /* near block/prev token context index */
int c = 0, rc = 0;
TOKENEXTRA *t = *tp; /* store tokens starting here */
@@ -127,21 +115,16 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- uint8_t token_cache[1024];
- const uint8_t *band_translate;
- ENTROPY_CONTEXT *A, *L;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- A = pd->above_context + aoff;
- L = pd->left_context + loff;
-
assert((!type && !plane) || (type && plane));
- pt = get_entropy_context(xd, tx_size, type, block, A, L,
- &scan, &band_translate);
- nb = vp9_get_coef_neighbors_handle(scan);
+ pt = get_entropy_context(tx_size, pd->above_context + aoff,
+ pd->left_context + loff);
+ get_scan(xd, tx_size, type, block, &scan, &nb);
c = 0;
do {
const int band = get_coef_band(band_translate, c);
@@ -210,12 +193,12 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
TOKENEXTRA *t_backup = *t;
const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
+ struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
if (mbmi->skip_coeff) {
@@ -236,149 +219,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
}
}
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
- FILE *f = fopen("context.bin", "rb");
- if (!f) {
- vp9_zero(context_counters);
- } else {
- fread(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
- }
-
- f = fopen("treeupdate.bin", "rb");
- if (!f) {
- vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
- } else {
- fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fclose(f);
- }
-}
-
-static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_count %s = {\n", header);
-
-#define Comma(X) (X ? "," : "")
- type = 0;
- do {
- ref = 0;
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- do {
- fprintf(f, "%s\n { /* %s */", Comma(type), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- const int64_t x = context_counters[type][ref][band][pt][t];
- const int y = (int) x;
-
- assert(x == (int64_t) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
- } while (++t < 1 + MAX_ENTROPY_TOKENS);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_probs %s = {", header);
-
- type = 0;
-#define Newline(x, spaces) (x ? " " : "\n" spaces)
- do {
- fprintf(f, "%s%s{ /* block Type %d */",
- Comma(type), Newline(type, " "), type);
- ref = 0;
- do {
- fprintf(f, "%s%s{ /* %s */",
- Comma(band), Newline(band, " "), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s%s{ /* Coeff Band %d */",
- Comma(band), Newline(band, " "), band);
- pt = 0;
- do {
- unsigned int branch_ct[ENTROPY_NODES][2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1];
- vp9_prob coef_probs[ENTROPY_NODES];
-
- if (pt >= 3 && band == 0)
- break;
- for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t)
- coef_counts[t] = context_counters[type][ref][band][pt][t];
- vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs,
- branch_ct, coef_counts, 0);
- branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
- } while (++t < ENTROPY_NODES);
-
- fprintf(f, " }");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-void print_context_counters() {
- FILE *f = fopen("vp9_context.c", "w");
-
- fprintf(f, "#include \"vp9_entropy.h\"\n");
- fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-
- /* print counts */
- print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
- "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
- "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
- "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
- "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
-
- /* print coefficient probabilities */
- print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
- "default_coef_probs_4x4[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
- "default_coef_probs_8x8[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
- "default_coef_probs_16x16[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
- "default_coef_probs_32x32[BLOCK_TYPES]");
-
- fclose(f);
-
- f = fopen("context.bin", "wb");
- fwrite(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
-}
-#endif
-
void vp9_tokenize_initialize() {
fill_value_tokens();
}
diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h
index b78e100..e24e31b 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/libvpx/vp9/encoder/vp9_tokenize.h
@@ -28,9 +28,6 @@ typedef struct {
uint8_t skip_eob_node;
} TOKENEXTRA;
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS + 1];
-
int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane);
@@ -39,13 +36,6 @@ struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
extern const int *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
* improve cache locality, since it's needed for costing when the rest of the
diff --git a/libvpx/vp9/encoder/vp9_vaq.c b/libvpx/vp9/encoder/vp9_vaq.c
new file mode 100644
index 0000000..1f9cb87
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_vaq.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/encoder/vp9_vaq.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#define ENERGY_MIN (-3)
+#define ENERGY_MAX (3)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy)\
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
+
+#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN]
+#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN]
+#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+
+unsigned int vp9_vaq_segment_id(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+
+ return SEGMENT_ID(energy);
+}
+
+double vp9_vaq_rdmult_ratio(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ return RDMULT_RATIO(energy);
+}
+
+double vp9_vaq_inv_q_ratio(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ return Q_RATIO(-energy);
+}
+
+void vp9_vaq_init() {
+ int i;
+ double base_ratio;
+
+ assert(ENERGY_SPAN <= MAX_SEGMENTS);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ base_ratio = 1.8;
+
+ for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+ Q_RATIO(i) = pow(base_ratio, i/3.0);
+ }
+}
+
+void vp9_vaq_frame_setup(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ int base_q = vp9_convert_qindex_to_q(cm->base_qindex);
+ int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
+ cm->y_dc_delta_q);
+ int i;
+
+ vp9_enable_segmentation((VP9_PTR)cpi);
+ vp9_clearall_segfeatures(seg);
+
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+ int qindex_delta, segment_rdmult;
+
+ if (Q_RATIO(i) == 1) {
+ // No need to enable SEG_LVL_ALT_Q for this segment
+ RDMULT_RATIO(i) = 1;
+ continue;
+ }
+
+ qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
+ vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
+
+ segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
+ cm->y_dc_delta_q);
+
+ RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+ }
+}
+
+
+static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var, sse;
+ int right_overflow = (xd->mb_to_right_edge < 0) ?
+ ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow = (xd->mb_to_bottom_edge < 0) ?
+ ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ if (right_overflow || bottom_overflow) {
+ const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+ const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
+ int avg;
+ variance(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, bw, bh, &sse, &avg);
+ var = sse - (((int64_t)avg * avg) / (bw * bh));
+ return (256 * var) / (bw * bh);
+ } else {
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+ return (256 * var) >> num_pels_log2_lookup[bs];
+ }
+}
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ double energy;
+ unsigned int var = block_variance(cpi, x, bs);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ // if (var <= 1000)
+ // return 0;
+
+  energy = 0.9 * (logf(var + 1) - 10.0);
+ return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
+}
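
For reference, the segment q ratios this file sets up follow Q_RATIO(i) = 1.8^(i/3) for energies -3..3, so the table runs from about 0.56 up to 1.8, with energy 0 mapping to ratio 1 (the segment left with SEG_LVL_ALT_Q disabled). A standalone sketch reproducing the table:

#include <math.h>
#include <stdio.h>

// Reproduces the vp9_vaq_init() table above with base_ratio = 1.8.
int main(void) {
  int i;
  for (i = -3; i <= 3; i++)
    printf("energy %+d -> q ratio %.3f\n", i, pow(1.8, i / 3.0));
  return 0;
}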
diff --git a/libvpx/vp9/encoder/vp9_vaq.h b/libvpx/vp9/encoder/vp9_vaq.h
new file mode 100644
index 0000000..dc18b22
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_vaq.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_CONFIG_VAQ_H_
+#define VP9_ENCODER_VP9_CONFIG_VAQ_H_
+
+#include "vp9/encoder/vp9_onyx_int.h"
+
+unsigned int vp9_vaq_segment_id(int energy);
+double vp9_vaq_rdmult_ratio(int energy);
+double vp9_vaq_inv_q_ratio(int energy);
+
+void vp9_vaq_init();
+void vp9_vaq_frame_setup(VP9_COMP *cpi);
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#endif // VP9_ENCODER_VP9_CONFIG_VAQ_H_
diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h
index 6e686d6..2ded97c 100644
--- a/libvpx/vp9/encoder/vp9_variance.h
+++ b/libvpx/vp9/encoder/vp9_variance.h
@@ -14,6 +14,15 @@
#include "vpx/vpx_integer.h"
// #include "./vpx_config.h"
+void variance(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum);
+
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -67,12 +76,6 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
unsigned int *sse,
const uint8_t *second_pred);
-typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
- int rp, unsigned long *sum_s,
- unsigned long *sum_r, unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr);
-
typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
diff --git a/libvpx/vp9/encoder/vp9_variance_c.c b/libvpx/vp9/encoder/vp9_variance_c.c
index 155ba8a..8bc3850 100644
--- a/libvpx/vp9/encoder/vp9_variance_c.c
+++ b/libvpx/vp9/encoder/vp9_variance_c.c
@@ -8,13 +8,150 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_subpelvar.h"
-#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+void variance(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum) {
+ int i, j;
+ int diff;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : uint8_t *src_ptr : Pointer to source block.
+ * uint32_t src_pixels_per_line : Stride of input block.
+ * uint32_t pixel_step : Offset between filter input
+ * samples (see notes).
+ * uint32_t output_height : Input block height.
+ * uint32_t output_width : Input block width.
+ * int32_t *vp9_filter : Array of 2 bi-linear filter
+ * taps.
+ *
+ * OUTPUTS : int32_t *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=
+ * stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : int32_t *src_ptr : Pointer to source block.
+ * uint32_t src_pixels_per_line : Stride of input block.
+ * uint32_t pixel_step : Offset between filter input
+ * samples (see notes).
+ * uint32_t output_height : Input block height.
+ * uint32_t output_width : Input block width.
+ * int32_t *vp9_filter : Array of 2 bi-linear filter
+ * taps.
+ *
+ * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 32-bit input as produced by
+ * filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=
+ * stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+ src_ptr++;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
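
The two passes compose into the usual separable subpel variance: filter horizontally into a 16-bit intermediate with one extra row, filter that vertically back to 8 bits, then run variance() against the reference. A hypothetical 8x8 composition (the bilinear filter table name is an assumption, not part of this patch; everything else is declared in this file):

static unsigned int subpel_variance8x8_sketch(const uint8_t *src,
                                              int src_stride,
                                              int xoffset, int yoffset,
                                              const uint8_t *ref,
                                              int ref_stride,
                                              unsigned int *sse) {
  uint16_t fdata3[9 * 8];  // first pass keeps h + 1 rows for the vertical tap
  uint8_t temp2[8 * 8];
  int sum;
  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 9, 8,
                                    bilinear_filters[xoffset]);   // assumed table
  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8,
                                     bilinear_filters[yoffset]);  // assumed table
  variance(temp2, 8, ref, ref_stride, 8, 8, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);  // 64 pixels
}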
diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 95ae266..2d59775 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -27,32 +27,14 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
__m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
return _mm_unpacklo_epi64(buf0, buf1);
}
-
-static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) {
- // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
- __m128i sign_bit = _mm_and_si128(a, mask16);
- __m128i b = _mm_unpacklo_epi16(a, kZero);
- sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
- sign_bit = _mm_unpacklo_epi16(kZero, sign_bit);
- return _mm_or_si128(sign_bit, b);
-}
-
-static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) {
- // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
- __m128i sign_bit = _mm_and_si128(a, mask16);
- __m128i b = _mm_unpackhi_epi16(a, kZero);
- sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
- sign_bit = _mm_unpackhi_epi16(kZero, sign_bit);
- return _mm_or_si128(sign_bit, b);
-}
#endif
-void FDCT32x32_2D(int16_t *input,
- int16_t *output_org, int pitch) {
+void FDCT32x32_2D(const int16_t *input,
+ int16_t *output_org, int stride) {
// Calculate pre-multiplied strides
- const int str1 = pitch >> 1;
- const int str2 = pitch;
- const int str3 = pitch + str1;
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
// Constants
@@ -111,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
// Note: even though all the loads below are aligned, using the aligned
// intrinsic makes the code slightly slower.
if (0 == pass) {
- int16_t *in = &input[column_start];
+ const int16_t *in = &input[column_start];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- int16_t *ina = in + 0 * str1;
- int16_t *inb = in + 31 * str1;
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
__m128i *step1a = &step1[ 0];
__m128i *step1b = &step1[31];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -146,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 4 * str1;
- int16_t *inb = in + 27 * str1;
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
__m128i *step1a = &step1[ 4];
__m128i *step1b = &step1[27];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -176,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 8 * str1;
- int16_t *inb = in + 23 * str1;
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
__m128i *step1a = &step1[ 8];
__m128i *step1b = &step1[23];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -206,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 12 * str1;
- int16_t *inb = in + 19 * str1;
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
__m128i *step1a = &step1[12];
__m128i *step1b = &step1[19];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -1159,28 +1141,43 @@ void FDCT32x32_2D(int16_t *input,
} else {
__m128i lstep1[64], lstep2[64], lstep3[64];
__m128i u[32], v[32], sign[16];
- const __m128i mask16 = _mm_set1_epi32(0x80008000);
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
// start using 32-bit operations
// stage 3
{
// expanding to 32-bit length prior to addition operations
- lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
- lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
- lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
- lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
- lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
- lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
- lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
- lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
- lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
- lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
- lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
- lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
- lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
- lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
- lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
- lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+ lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+ lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+ lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+ lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+ lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+ lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+ lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+ lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+ lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+ lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+ lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+ lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+ lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+ lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+ lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+ lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+ lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+ lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+ lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+ lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+ lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+ lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+ lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+ lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+ lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+ lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+ lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
@@ -1231,42 +1228,75 @@ void FDCT32x32_2D(int16_t *input,
lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
}
{
- lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
- lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
- lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
- lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
- lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
- lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
- lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
- lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
- lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
- lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
- lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
- lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
- lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
- lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
- lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
- lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
-
- lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
- lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
- lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
- lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
- lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
- lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
- lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
- lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
- lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
- lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
- lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
- lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
- lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
- lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
- lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
- lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+ lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
@@ -1302,14 +1332,22 @@ void FDCT32x32_2D(int16_t *input,
// stage 4
{
// expanding to 32-bit length prior to addition operations
- lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
- lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
- lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
- lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero);
- lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero);
- lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero);
- lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero);
- lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero);
+ lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+ lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+ lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+ lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
@@ -1337,41 +1375,41 @@ void FDCT32x32_2D(int16_t *input,
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
- // to be continued...
- //
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
- v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
- v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
- v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
- v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
- v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
- v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
- v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
-
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ // to be continued...
+ //
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
{
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -2647,4 +2685,4 @@ void FDCT32x32_2D(int16_t *input,
}
}
}
-}
+} // NOLINT
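
The recurring unpack-then-madd pairs above replace the deleted k_cvtlo/k_cvthi helpers: interleaving with kZero puts each int16 in the low half of a 32-bit lane as (value, 0), and _mm_madd_epi16 against kOne = _mm_set1_epi16(1) computes value*1 + 0*1 with signed 16-bit multiplies, which sign-extends correctly in two instructions instead of the old mask/compare/or chain. A self-contained sketch of the idiom:

#include <emmintrin.h>  // SSE2

// Widen eight signed 16-bit lanes to eight signed 32-bit lanes.
static void widen_epi16(__m128i x, __m128i *lo, __m128i *hi) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i kOne = _mm_set1_epi16(1);
  *lo = _mm_madd_epi16(_mm_unpacklo_epi16(x, kZero), kOne);  // lanes 0..3
  *hi = _mm_madd_epi16(_mm_unpackhi_epi16(x, kZero), kOne);  // lanes 4..7
}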
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index eb271fe..dc11501 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,14 +12,13 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -112,12 +111,8 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
}
}
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
- vp9_short_fdct4x4_sse2(input, output, pitch);
- vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
@@ -171,22 +166,21 @@ static INLINE void transpose_4x4(__m128i *res) {
void fdct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[4], v[4];
- u[0] = _mm_add_epi16(in[0], in[3]);
- u[1] = _mm_add_epi16(in[1], in[2]);
- u[2] = _mm_sub_epi16(in[1], in[2]);
- u[3] = _mm_sub_epi16(in[0], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpacklo_epi16(u[2], u[3]);
u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
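
The swapped constants follow from interleaving before the butterfly: with per-lane u[0] = (in0, in1) and u[1] = (in3, in2), the subtraction gives v[1] = (in0 - in3, in1 - in2), so madd against (cospi_8, cospi_24) yields cospi_8*(in0 - in3) + cospi_24*(in1 - in2) -- the same value the old code obtained from madd of (in1 - in2, in0 - in3) against (cospi_24, cospi_8), but with two fewer packed add/sub operations since the interleave now feeds both the even and odd halves.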
@@ -249,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
transpose_4x4(in);
}
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[4];
load_buffer_4x4(input, in, stride);
@@ -277,8 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
write_buffer_4x4(output, in);
}
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int stride = pitch >> 1;
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -535,15 +528,16 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
}
// load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
@@ -1033,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[8];
load_buffer_8x8(input, in, stride);
@@ -1062,18 +1056,17 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
write_buffer_8x8(output, in, 8);
}
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -1688,7 +1681,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
}
}
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
load_buffer_8x8(input, in0, stride);
@@ -2540,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_16x16(input, in0, in1, stride);
@@ -2572,13 +2565,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
write_buffer_16x16(output, in0, in1, 16);
}
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
index d141560..a3d0114 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
@@ -8,12 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
const unsigned char *src_ptr,
@@ -45,7 +45,6 @@ unsigned int vp9_variance4x4_mmx(
vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
-
}
unsigned int vp9_variance8x8_mmx(
@@ -61,7 +60,6 @@ unsigned int vp9_variance8x8_mmx(
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
-
}
unsigned int vp9_mse16x16_mmx(
@@ -74,10 +72,14 @@ unsigned int vp9_mse16x16_mmx(
int sum0, sum1, sum2, sum3;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+ ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
@@ -94,11 +96,14 @@ unsigned int vp9_variance16x16_mmx(
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+ ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
@@ -115,14 +120,15 @@ unsigned int vp9_variance16x8_mmx(
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
-
}
@@ -135,13 +141,14 @@ unsigned int vp9_variance8x16_mmx(
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
-
}
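All of the reflowed calls in this file implement one identity: each 8x8 sub-block contributes an SSE and a sum, and variance over N pixels is SSE - sum^2/N, with the division written as a shift (>> 6 for 64 pixels, >> 7 for 128, >> 8 for 256). A plain-C sketch of how vp9_variance16x16_mmx composes four 8x8 results; get8x8var here is a hypothetical scalar stand-in for vp9_get8x8var_mmx:

#include <stdint.h>

/* Scalar stand-in for vp9_get8x8var_mmx: SSE and sum over one 8x8 block. */
static void get8x8var(const uint8_t *src, int src_stride,
                      const uint8_t *ref, int ref_stride,
                      unsigned int *sse, int *sum) {
  int r, c;
  *sse = 0;
  *sum = 0;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      *sse += d * d;
      *sum += d;
    }
  }
}

unsigned int variance16x16(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3;
  int sum0, sum1, sum2, sum3, avg;
  get8x8var(src, src_stride, ref, ref_stride, &sse0, &sum0);
  get8x8var(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
  get8x8var(src + 8 * src_stride, src_stride,
            ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
  get8x8var(src + 8 * src_stride + 8, src_stride,
            ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
  *sse = sse0 + sse1 + sse2 + sse3;
  avg = sum0 + sum1 + sum2 + sum3;
  /* var = SSE - sum^2 / 256; 256 pixels, hence the >> 8. */
  return *sse - (((unsigned int)avg * avg) >> 8);
}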
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index cea934d..79e42c4 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
@@ -26,7 +26,7 @@ extern unsigned int vp9_get4x4var_mmx
unsigned int vp9_get_mb_ss_sse2
(
- const short *src_ptr
+ const int16_t *src_ptr
);
unsigned int vp9_get16x16var_sse2
(
@@ -250,7 +250,6 @@ unsigned int vp9_mse16x16_sse2(
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
-
unsigned int sse0;
int sum0;
vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
@@ -407,12 +406,12 @@ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
@@ -487,12 +486,12 @@ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
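The only functional change in the two macro tables is the cast argument that was previously left empty for the sub-16x16 sizes. The cast guards the sum * sum product: for a 16x16 block the difference sum can reach 256 * 255 = 65280, whose square (about 4.26e9) overflows a signed 32-bit int but still fits in an unsigned int, while a 32x32 sum of up to 261120 squares to roughly 6.8e10 and needs int64_t. A small standalone illustration of the failure mode the casts close off:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int sum16 = 256 * 255;    /* worst-case |sum| for a 16x16 block */
  const int sum32 = 1024 * 255;   /* worst-case |sum| for a 32x32 block */

  /* sum16 * sum16 = 4,261,478,400 overflows int32_t (undefined behavior)
   * but is exact as unsigned int, so the 16x16-and-smaller rows now pass
   * (unsigned int) instead of an empty cast. */
  unsigned int ok16 = (unsigned int)sum16 * sum16;

  /* sum32 * sum32 = 68,183,654,400 does not fit in 32 bits at all,
   * so the 32x32-and-larger rows keep the (int64_t) cast. */
  int64_t ok32 = (int64_t)sum32 * sum32;

  printf("%u %lld\n", ok16, (long long)ok32);
  return 0;
}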
diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk
index 687fb48..0badb08 100644
--- a/libvpx/vp9/vp9_common.mk
+++ b/libvpx/vp9/vp9_common.mk
@@ -48,7 +48,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
-VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
VP9_COMMON_SRCS-yes += common/vp9_scale.h
VP9_COMMON_SRCS-yes += common/vp9_scale.c
VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -69,13 +68,17 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.h
+VP9_COMMON_SRCS-yes += common/vp9_scan.c
+VP9_COMMON_SRCS-yes += common/vp9_scan.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
@@ -88,11 +91,28 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
endif
+# common (c)
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
@@ -109,5 +129,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM)
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index 48866d2..1942039 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -8,30 +8,30 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
+#include <string.h>
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vpx/vp8cx.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/common/vp9_onyx.h"
#include "vp9/vp9_iface_common.h"
-#include <stdlib.h>
-#include <string.h>
struct vp9_extracfg {
struct vpx_codec_pkt_list *pkt_list;
- int cpu_used; /** available cpu percentage in 1/16*/
- unsigned int enable_auto_alt_ref; /** if encoder decides to uses alternate reference frame */
+ int cpu_used; /* available cpu percentage in 1/16 */
+ unsigned int enable_auto_alt_ref;
unsigned int noise_sensitivity;
unsigned int Sharpness;
unsigned int static_thresh;
unsigned int tile_columns;
unsigned int tile_rows;
- unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
- unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
- unsigned int arnr_type; /* alt_ref filter type */
+ unsigned int arnr_max_frames;
+ unsigned int arnr_strength;
+ unsigned int arnr_type;
unsigned int experimental;
vp8e_tuning tuning;
unsigned int cq_level; /* constrained quality level */
@@ -48,7 +48,7 @@ struct extraconfig_map {
static const struct extraconfig_map extracfg_map[] = {
{
0,
- {
+ { // NOLINT
NULL,
0, /* cpu_used */
1, /* enable_auto_alt_ref */
@@ -85,11 +85,11 @@ struct vpx_codec_alg_priv {
uint32_t pending_frame_magnitude;
vpx_image_t preview_img;
vp8_postproc_cfg_t preview_ppcfg;
- vpx_codec_pkt_list_decl(64) pkt_list; // changed to accomendate the maximum number of lagged frames allowed
+ vpx_codec_pkt_list_decl(64) pkt_list;
unsigned int fixed_kf_cntr;
};
-static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
switch (frame) {
case VP8_LAST_FRAME:
return VP9_LAST_FLAG;
@@ -120,26 +120,26 @@ update_error_state(vpx_codec_alg_priv_t *ctx,
#define ERROR(str) do {\
ctx->base.err_detail = str;\
return VPX_CODEC_INVALID_PARAM;\
- } while(0)
+ } while (0)
-#define RANGE_CHECK(p,memb,lo,hi) do {\
- if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+ if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
ERROR(#memb " out of range ["#lo".."#hi"]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_HI(p,memb,hi) do {\
- if(!((p)->memb <= (hi))) \
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+ if (!((p)->memb <= (hi))) \
ERROR(#memb " out of range [.."#hi"]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_LO(p,memb,lo) do {\
- if(!((p)->memb >= (lo))) \
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+ if (!((p)->memb >= (lo))) \
ERROR(#memb " out of range ["#lo"..]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_BOOL(p,memb) do {\
- if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
- } while(0)
+#define RANGE_CHECK_BOOL(p, memb) do {\
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+ } while (0)
static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
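The whitespace-only churn in the hunk above keeps the do { ... } while (0) wrapper intact; that wrapper is what lets each RANGE_CHECK* macro expand to a single statement that stays safe under a braceless if/else. A small self-contained illustration (CHECK_RANGE is a made-up stand-in):

#include <stdio.h>

/* Without do/while (0), a multi-statement macro breaks under a braceless
 * if/else; wrapped, it expands to exactly one statement. */
#define CHECK_RANGE(v, lo, hi) do {                       \
    if ((v) < (lo) || (v) > (hi))                         \
      printf("%s out of range [%d..%d]\n", #v, lo, hi);   \
  } while (0)

int main(void) {
  int q = 64;
  if (q != 0)
    CHECK_RANGE(q, 0, 63);   /* expands safely as a single statement */
  else
    printf("zero\n");
  return 0;
}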
@@ -247,7 +247,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->width = cfg.g_w;
oxcf->height = cfg.g_h;
/* guess a frame rate if out of whack, use 30 */
- oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+ oxcf->framerate = (double)(cfg.g_timebase.den)
+ / (double)(cfg.g_timebase.num);
if (oxcf->framerate > 180) {
oxcf->framerate = 30;
@@ -255,7 +256,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
switch (cfg.g_pass) {
case VPX_RC_ONE_PASS:
- oxcf->Mode = MODE_BESTQUALITY;
+ oxcf->Mode = MODE_GOODQUALITY;
break;
case VPX_RC_FIRST_PASS:
oxcf->Mode = MODE_FIRSTPASS;
@@ -266,25 +267,25 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
}
if (cfg.g_pass == VPX_RC_FIRST_PASS) {
- oxcf->allow_lag = 0;
- oxcf->lag_in_frames = 0;
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
} else {
- oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
- oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
}
// VBR only supported for now.
  // CBR code has been deprecated for the experimental phase.
// CQ mode not yet tested
oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
- /*
if (cfg.rc_end_usage == VPX_CQ)
oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
- */
- if (cfg.rc_end_usage == VPX_Q)
+ else if (cfg.rc_end_usage == VPX_Q)
oxcf->end_usage = USAGE_CONSTANT_QUALITY;
+ else if (cfg.rc_end_usage == VPX_CBR)
+ oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
- oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
oxcf->best_allowed_q = cfg.rc_min_quantizer;
@@ -299,7 +300,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
- oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
@@ -315,23 +316,23 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->encode_breakout = vp8_cfg.static_thresh;
oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
- oxcf->Sharpness = vp8_cfg.Sharpness;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
- oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
- oxcf->output_pkt_list = vp8_cfg.pkt_list;
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
- oxcf->arnr_strength = vp8_cfg.arnr_strength;
- oxcf->arnr_type = vp8_cfg.arnr_type;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
oxcf->tuning = vp8_cfg.tuning;
oxcf->tile_columns = vp8_cfg.tile_columns;
- oxcf->tile_rows = vp8_cfg.tile_rows;
+ oxcf->tile_rows = vp8_cfg.tile_rows;
oxcf->lossless = vp8_cfg.lossless;
- oxcf->error_resilient_mode = cfg.g_error_resilient;
+ oxcf->error_resilient_mode = cfg.g_error_resilient;
oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
oxcf->ss_number_layers = cfg.ss_number_layers;
@@ -441,8 +442,6 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type);
MAP(VP8E_SET_TUNING, xcfg.tuning);
MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
- MAP(VP9E_SET_MAX_Q, ctx->cfg.rc_max_quantizer);
- MAP(VP9E_SET_MIN_Q, ctx->cfg.rc_min_quantizer);
MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
MAP(VP9E_SET_LOSSLESS, xcfg.lossless);
MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode);
@@ -500,7 +499,7 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx,
*/
for (i = 0;
extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- i++);
+ i++) {}
priv->vp8_cfg = extracfg_map[i].cfg;
priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
@@ -555,7 +554,6 @@ static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
-
free(ctx->cx_data);
vp9_remove_compressor(&ctx->cpi);
free(ctx);
@@ -589,7 +587,8 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
uint8_t marker = 0xc0;
- int mag, mask, index_sz;
+ unsigned int mask;
+ int mag, index_sz;
assert(ctx->pending_frame_count);
assert(ctx->pending_frame_count <= 8);
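The 0xc0 seed and the widened mask implement the VP9 superframe index marker: the top three bits are 0b110, bits 3-4 hold the per-frame size width in bytes minus one, and bits 0-2 the frame count minus one. A hedged sketch of assembling that byte (helper name hypothetical; field layout per the VP9 superframe format):

#include <stdint.h>

/* Assemble a VP9 superframe marker byte. mag ends up as the number of
 * bytes needed for the largest frame size minus one, probed the same way
 * the function above probes pending_frame_magnitude. */
static uint8_t superframe_marker(unsigned int frames, unsigned int max_size) {
  uint8_t marker = 0xc0;             /* 0b110 in the top three bits */
  unsigned int mag = 0, mask = 0xff;
  while (max_size > mask) {          /* 1..4 size bytes */
    mag++;
    mask = (mask << 8) | 0xff;
  }
  marker |= (uint8_t)(frames - 1);   /* bits 0-2: frame count - 1 */
  marker |= (uint8_t)(mag << 3);     /* bits 3-4: size bytes - 1 */
  return marker;
}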
@@ -713,8 +712,10 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
/* vp8 uses 10,000,000 ticks/second as its time stamp */
- dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
- dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+ dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num
+ / ctx->cfg.g_timebase.den;
+ dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
+ ctx->cfg.g_timebase.den;
if (img != NULL) {
res = image2yuvconfig(img, &sd);
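The reflowed arithmetic above converts a pts expressed in the stream's timebase (num/den) into the encoder's fixed 10 MHz tick clock, and the later rounding fix widens a matching computation to vpx_codec_pts_t before it can overflow int. A small worked example under an assumed 1/30 timebase:

#include <stdint.h>
#include <stdio.h>

typedef int64_t vpx_codec_pts_t;  /* matches the libvpx typedef */

int main(void) {
  /* Assumed timebase of 1/30: one pts unit per frame at 30 fps. */
  const int num = 1, den = 30;
  vpx_codec_pts_t pts = 90;  /* 90 frames = 3 seconds */

  /* 10,000,000 ticks per second, as the encoder-side clock expects. */
  vpx_codec_pts_t ticks = pts * 10000000 * num / den;
  printf("%lld ticks\n", (long long)ticks);  /* 30,000,000 */

  /* The rounding constant must be computed in 64 bits: with a large
   * timebase numerator, 1000000 * num / 2 - 1 can overflow plain int,
   * which is why the patch casts to vpx_codec_pts_t first. */
  vpx_codec_pts_t round = (vpx_codec_pts_t)1000000 * num / 2 - 1;
  printf("%lld\n", (long long)round);
  return 0;
}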
@@ -768,7 +769,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
}
/* Add the frame packet to the list of returned packets. */
- round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+ round = (vpx_codec_pts_t)1000000 * ctx->cfg.g_timebase.num / 2 - 1;
delta = (dst_end_time_stamp - dst_time_stamp);
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
@@ -840,8 +841,6 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
cx_data += size;
cx_data_sz -= size;
}
-
- // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
}
}
}
@@ -868,15 +867,14 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx,
vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
&sd);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -887,8 +885,9 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
vp9_copy_reference_enc(ctx->cpi,
ref_frame_to_vp9_reframe(frame->frame_type), &sd);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -917,8 +916,9 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
if (data) {
ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
#else
(void)ctx;
(void)ctr_id;
@@ -929,7 +929,6 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0};
@@ -942,8 +941,9 @@ static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
yuvconfig2image(&ctx->preview_img, &sd, NULL);
return &ctx->preview_img;
- } else
+ } else {
return NULL;
+ }
}
static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
@@ -952,7 +952,6 @@ static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
int update = va_arg(args, int);
vp9_update_entropy(ctx->cpi, update);
return VPX_CODEC_OK;
-
}
static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
@@ -974,64 +973,30 @@ static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
- vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
- if (data) {
- vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
- if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
- roi->delta_q, roi->delta_lf, roi->static_threshold))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
+ // TODO(yaowu): Need to re-implement and test for VP9.
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
- vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
- if (data) {
-
- vpx_active_map_t *map = (vpx_active_map_t *)data;
-
- if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
+ // TODO(yaowu): Need to re-implement and test for VP9.
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
if (data) {
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
- res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
- scalemode.v_scaling_mode);
-
- if (!res) {
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
+ res = vp9_set_internal_size(ctx->cpi,
+ (VPX_SCALING)scalemode.h_scaling_mode,
+ (VPX_SCALING)scalemode.v_scaling_mode);
-static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
- if (data) {
- int res;
- res = vp9_set_size_literal(ctx->cpi, *data, 0);
if (!res) {
return VPX_CODEC_OK;
} else {
@@ -1042,50 +1007,40 @@ static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
}
}
-static vpx_codec_err_t vp9e_set_height(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
+static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
+ va_list args) {
+ int data = va_arg(args, int);
+ vp9_set_svc(ctx->cpi, data);
+ return VPX_CODEC_OK;
+}
- if (data) {
- int res;
- res = vp9_set_size_literal(ctx->cpi, 0, *data);
+static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
+ int ctr_id, va_list args) {
+ vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *);
+ VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+ vpx_svc_parameters_t params;
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
+ if (data == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
-}
-
-static vpx_codec_err_t vp9e_set_layer(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
- if (data) {
- int res;
- res = 0;
+ params = *(vpx_svc_parameters_t *)data;
- res = vp9_switch_layer(ctx->cpi, *data);
+ cpi->current_layer = params.layer;
+ cpi->lst_fb_idx = params.lst_fb_idx;
+ cpi->gld_fb_idx = params.gld_fb_idx;
+ cpi->alt_fb_idx = params.alt_fb_idx;
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
+ if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) {
return VPX_CODEC_INVALID_PARAM;
}
-}
-static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
- va_list args) {
- int data = va_arg(args, int);
- vp9_set_svc(ctx->cpi, data);
+ ctx->cfg.rc_max_quantizer = params.max_quantizer;
+ ctx->cfg.rc_min_quantizer = params.min_quantizer;
+
+ set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ vp9_change_config(ctx->cpi, &ctx->oxcf);
+
return VPX_CODEC_OK;
}
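With VP9E_SET_WIDTH, VP9E_SET_HEIGHT, and VP9E_SET_LAYER gone, per-layer state now travels in one vpx_svc_parameters_t and is applied through the new control. A hedged sketch of how a caller might drive it, with made-up layer values; only the fields the handler above actually reads are set:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Configure the encoder for one spatial layer before encoding its frame.
 * codec is an initialized vpx_codec_ctx_t for vpx_codec_vp9_cx(). */
static int use_layer(vpx_codec_ctx_t *codec) {
  vpx_svc_parameters_t params = {0};
  params.width = 640;          /* this layer's coded size */
  params.height = 360;
  params.layer = 0;            /* current spatial layer index */
  params.lst_fb_idx = 0;       /* reference buffer assignments */
  params.gld_fb_idx = 1;
  params.alt_fb_idx = 2;
  params.min_quantizer = 8;    /* folded into rc_min/max_quantizer */
  params.max_quantizer = 48;
  return vpx_codec_control(codec, VP9E_SET_SVC_PARAMETERS, &params) ==
         VPX_CODEC_OK;
}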
@@ -1113,23 +1068,19 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
{VP8E_SET_ARNR_TYPE, set_param},
{VP8E_SET_TUNING, set_param},
{VP8E_SET_CQ_LEVEL, set_param},
- {VP9E_SET_MAX_Q, set_param},
- {VP9E_SET_MIN_Q, set_param},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
{VP9E_SET_LOSSLESS, set_param},
{VP9E_SET_FRAME_PARALLEL_DECODING, set_param},
{VP9_GET_REFERENCE, get_reference},
- {VP9E_SET_WIDTH, vp9e_set_width},
- {VP9E_SET_HEIGHT, vp9e_set_height},
- {VP9E_SET_LAYER, vp9e_set_layer},
{VP9E_SET_SVC, vp9e_set_svc},
+ {VP9E_SET_SVC_PARAMETERS, vp9e_set_svc_parameters},
{ -1, NULL},
};
static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
{
0,
- {
+ { // NOLINT
0, /* g_usage */
0, /* g_threads */
0, /* g_profile */
@@ -1198,13 +1149,13 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp9e_encode, /* vpx_codec_encode_fn_t encode; */
vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
@@ -1227,13 +1178,13 @@ CODEC_INTERFACE(vpx_codec_vp9x_cx) = {
vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp9e_encode, /* vpx_codec_encode_fn_t encode; */
vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 10b3238..5dacab4 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -14,7 +14,7 @@
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
#include "vp9/decoder/vp9_onyxd.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
@@ -172,9 +172,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
rb.bit_offset += 1; // show frame
rb.bit_offset += 1; // error resilient
- if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 ||
- vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 ||
- vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) {
+ if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 ||
+ vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 ||
+ vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) {
return VPX_CODEC_UNSUP_BITSTREAM;
}
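The renamed constants are the three-byte VP9 sync code (0x49, 0x83, 0x42) that every keyframe header carries after the frame-marker, version, show-frame, and error-resilient bits. A hedged, bit-buffer-free sketch of the same probe on a byte-aligned view; real parsing must go through the bit reader as vp9_peek_si does, since the code sits at a bit offset in the stream:

#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

/* Returns 1 if the three bytes at p match the VP9 sync code. */
static int is_vp9_sync_code(const uint8_t *p) {
  return p[0] == VP9_SYNC_CODE_0 &&
         p[1] == VP9_SYNC_CODE_1 &&
         p[2] == VP9_SYNC_CODE_2;
}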
@@ -205,7 +205,6 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx,
vpx_codec_stream_info_t *si) {
-
unsigned int sz;
if (si->sz >= sizeof(vp9_stream_info_t))
@@ -323,15 +322,20 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
vp9_ppflags_t flags = {0};
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
- flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+ flags.post_proc_flag =
#if CONFIG_POSTPROC_VISUALIZER
-
- | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
- | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+ ((ctx->dbg_color_ref_frame_flag != 0) ?
+ VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ?
+ VP9D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ?
+ VP9D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ?
+ VP9D_DEBUG_DRAW_MV : 0)
+ |
#endif
-;
+ ctx->postproc_cfg.post_proc_flag;
+
flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
flags.noise_level = ctx->postproc_cfg.noise_level;
#if CONFIG_POSTPROC_VISUALIZER
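The rewrite in the hunk above fixes how the flag word is assembled: the old code started with the unconditional term and relied on a #if-guarded OR-chain closed by a bare ';' line, while the new order makes ctx->postproc_cfg.post_proc_flag the final operand so each guarded term can safely end in '|'. A tiny standalone version of that ordering trick (DEBUG_FLAGS and build_flags are made up):

#define DEBUG_FLAGS 1

/* Put the always-present operand last so the conditional block can
 * contribute "term |" pieces without leaving a dangling operator. */
static int build_flags(int base, int dbg) {
  return
#if DEBUG_FLAGS
      (dbg ? 0x2 : 0) |
#endif
      base;
}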
@@ -496,8 +500,9 @@ static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
res = VPX_CODEC_OK;
- } else
+ } else {
res = VPX_CODEC_LIST_END;
+ }
} while (!mmap->sz && res != VPX_CODEC_LIST_END);
return res;
@@ -542,7 +547,6 @@ static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -553,15 +557,14 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
return vp9_set_reference_dec(ctx->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -572,9 +575,9 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
return vp9_copy_reference_dec(ctx->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -603,9 +606,9 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
ctx->postproc_cfg_set = 1;
ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
#else
return VPX_CODEC_INCAPABLE;
#endif
@@ -642,25 +645,27 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
*update_info = pbi->refresh_frame_flags;
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
int ctrl_id,
va_list args) {
-
int *corrupted = va_arg(args, int *);
if (corrupted) {
VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
- *corrupted = pbi->common.frame_to_show->corrupted;
-
+ if (pbi)
+ *corrupted = pbi->common.frame_to_show->corrupted;
+ else
+ return VPX_CODEC_ERROR;
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -699,13 +704,13 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
vp9_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp9_decode, /* vpx_codec_decode_fn_t decode; */
vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
/* encoder functions */
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 9fbf100..0993c6c 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -20,6 +20,7 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c
VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c
@@ -64,6 +65,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-yes += encoder/vp9_variance_c.c
+VP9_CX_SRCS-yes += encoder/vp9_vaq.c
+VP9_CX_SRCS-yes += encoder/vp9_vaq.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index be3afe8..3a27cdd 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -32,12 +32,7 @@ VP9_DX_SRCS-yes += decoder/vp9_thread.c
VP9_DX_SRCS-yes += decoder/vp9_thread.h
VP9_DX_SRCS-yes += decoder/vp9_treereader.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
-
-VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM)