46 files changed, 3852 insertions, 2950 deletions
diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.c b/libvpx/vp9/encoder/vp9_aq_complexity.c
new file mode 100644
index 0000000..47ad8d8
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_segmentation.h"
+
+static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
+  {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};  // Rate ratio per segment id.
+
+void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  int seg_id;
+
+  // Floating point is used below, so reset the system state first.
+  vp9_clear_system_state();
+
+  // The segments are only set up on key frames, alt-ref refreshes and
+  // golden refreshes whose source frame is not the alt-ref.
+  if (cm->frame_type != KEY_FRAME &&
+      !cpi->refresh_alt_ref_frame &&
+      !(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))
+    return;
+
+  // Reset the segment map and the complexity map used for rd.
+  vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+  vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+
+  vp9_enable_segmentation(seg);
+  vp9_clearall_segfeatures(seg);
+
+  // Select delta coding for the segment data.
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  // With its "Q" feature disabled, segment 0 defaults to the baseline Q.
+  vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+
+  // Only segment 1 currently gets an in frame Q adjustment.
+  for (seg_id = 1; seg_id < 2; seg_id++) {
+    const int qindex_delta =
+        vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                   in_frame_q_adj_ratio[seg_id]);
+    vp9_enable_segfeature(seg, seg_id, SEG_LVL_ALT_Q);
+    vp9_set_segdata(seg, seg_id, SEG_LVL_ALT_Q, qindex_delta);
+  }
+}
+
+// Select a segment for the current SB64 and store it, together with a
+// complexity metric, in the per-frame maps. Rates are widened to 64 bits
+// so the scaling products cannot overflow int.
+void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
+                                   int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+  // Clip the SB64 to the part that lies inside the frame.
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  // Defaults used when output is disabled or the target rate is not positive.
+  int complexity_metric = 64;
+  unsigned char segment = 0;
+  int x, y;
+
+  if (output_enabled) {
+    // Rate depends on fraction of a SB64 in frame ((xmis * ymis) / (bw * bh)).
+    // It is converted to bits * 256 units.
+    const int64_t target_rate =
+        ((int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+
+    // Blocks projected at under a quarter of the target rate use segment 1.
+    if (projected_rate < target_rate / 4)
+      segment = 1;
+
+    if (target_rate > 0) {
+      // Projected/target rate ratio, scaled so that 64 means on target.
+      const int64_t ratio = ((int64_t)projected_rate * 64) / target_rate;
+      complexity_metric = (int)(ratio < 16 ? 16 : (ratio > 255 ? 255 : ratio));
+    }
+  }
+
+  // Fill in the entries in both maps corresponding to this SB64.
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      const int map_index = mi_offset + y * cm->mi_cols + x;
+      cpi->segmentation_map[map_index] = segment;
+      cpi->complexity_map[map_index] = (unsigned char)complexity_metric;
+    }
+  }
+}
diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.h b/libvpx/vp9/encoder/vp9_aq_complexity.h
new file mode 100644
index 0000000..af031a4
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;  // Forward declaration; only pointers to it are used here.
+
+// Select a segment for the SB64 at (mi_row, mi_col) from its projected rate.
+void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate);
+
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
diff --git a/libvpx/vp9/encoder/vp9_craq.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index 40437c7..7879091 100644
--- a/libvpx/vp9/encoder/vp9_craq.c
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -11,7 +11,7 @@
 #include <limits.h>
 #include <math.h>
 
-#include "vp9/encoder/vp9_craq.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
@@ -19,19 +19,69 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 
+struct CYCLIC_REFRESH {  // Definition is private to this file.
+  // Percentage of super-blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int max_sbs_perframe;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Block size below which we don't apply cyclic refresh.
+  BLOCK_SIZE min_block_size;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long a block will need to wait to be refreshed again.
+  int time_for_refresh;
+  // Actual number of (8x8) blocks coded with the delta-q (segment 1).
+  int num_seg_blocks;
+  // Actual encoding bits for segment 1.
+  int actual_seg_bits;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Refresh map per 8x8 block: 0 = refresh candidate, < 0 = counts up to 0.
+  signed char *map;
+  // Projected rate and distortion for the current superblock.
+  int64_t projected_rate_sb;
+  int64_t projected_dist_sb;
+  // Thresholds applied to projected rate/distortion of the superblock.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+};
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  // Returns the new context, or NULL if either allocation fails.
+  CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
+
+  if (cr != NULL) {
+    // The map holds one entry per mi unit of the frame.
+    cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+    if (cr->map != NULL)
+      return cr;
+    vpx_free(cr);  // Do not leak the context when the map fails.
+  }
+  return NULL;
+}
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  if (cr != NULL) vpx_free(cr->map);  // cr is NULL if the alloc call failed.
+  vpx_free(cr);
+}
 
 // Check if we should turn off cyclic refresh based on bitrate condition.
-static int apply_cyclic_refresh_bitrate(VP9_COMP *const cpi) {
+static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
+                                        const RATE_CONTROL *rc) {
   // Turn off cyclic refresh if bits available per frame is not sufficiently
   // larger than bit cost of segmentation. Segment map bit cost should scale
   // with number of seg blocks, so compare available bits to number of blocks.
   // Average bits available per frame = av_per_frame_bandwidth
   // Number of (8x8) blocks in frame = mi_rows * mi_cols;
-  float factor  = 0.5;
-  int number_blocks = cpi->common.mi_rows  * cpi->common.mi_cols;
+  const float factor  = 0.5;
+  const int number_blocks = cm->mi_rows  * cm->mi_cols;
   // The condition below corresponds to turning off at target bitrates:
   // ~24kbps for CIF, 72kbps for VGA (at 30fps).
-  if (cpi->rc.av_per_frame_bandwidth < factor * number_blocks)
+  // Also turn off at very small frame sizes, to avoid too large fraction of
+  // superblocks to be refreshed per frame. Threshold below is less than QCIF.
+  if (rc->av_per_frame_bandwidth < factor * number_blocks ||
+      number_blocks / 64 < 5)
     return 0;
   else
     return 1;
@@ -41,11 +91,9 @@ static int apply_cyclic_refresh_bitrate(VP9_COMP *const cpi) {
 // (lower-qp coding). Decision can be based on various factors, such as
 // size of the coding block (i.e., below min_block size rejected), coding
 // mode, and rate/distortion.
-static int candidate_refresh_aq(VP9_COMP *const cpi,
-                                MODE_INFO *const mi,
-                                int bsize,
-                                int use_rd) {
-  CYCLIC_REFRESH *const cr = &cpi->cyclic_refresh;
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi,
+                                BLOCK_SIZE bsize, int use_rd) {
   if (use_rd) {
     // If projected rate is below the thresh_rate (well below target,
     // so undershoot expected), accept it for lower-qp coding.
@@ -56,18 +104,18 @@ static int candidate_refresh_aq(VP9_COMP *const cpi,
     // 2) mode is non-zero mv and projected distortion is above thresh_dist
     // 3) mode is an intra-mode (we may want to allow some of this under
     // another thresh_dist)
-    else if ((bsize < cr->min_block_size) ||
-        (mi->mbmi.mv[0].as_int != 0 &&
-            cr->projected_dist_sb > cr->thresh_dist_sb) ||
-            !is_inter_block(&mi->mbmi))
+    else if (bsize < cr->min_block_size ||
+             (mbmi->mv[0].as_int != 0 &&
+              cr->projected_dist_sb > cr->thresh_dist_sb) ||
+             !is_inter_block(mbmi))
       return 0;
     else
       return 1;
   } else {
     // Rate/distortion not used for update.
-    if ((bsize < cr->min_block_size) ||
-      (mi->mbmi.mv[0].as_int != 0) ||
-      !is_inter_block(&mi->mbmi))
+    if (bsize < cr->min_block_size ||
+        mbmi->mv[0].as_int != 0 ||
+        !is_inter_block(mbmi))
       return 0;
     else
       return 1;
@@ -77,33 +125,31 @@ static int candidate_refresh_aq(VP9_COMP *const cpi,
 // Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
 // check if we should reset the segment_id, and update the cyclic_refresh map
 // and segmentation map.
-void vp9_update_segment_aq(VP9_COMP *const cpi,
-                           MODE_INFO *const mi,
-                           int mi_row,
-                           int mi_col,
-                           int bsize,
-                           int use_rd) {
-  CYCLIC_REFRESH *const cr = &cpi->cyclic_refresh;
-  VP9_COMMON *const cm = &cpi->common;
+void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
+                                       MB_MODE_INFO *const mbmi,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, int use_rd) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
   const int xmis = MIN(cm->mi_cols - mi_col, bw);
   const int ymis = MIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
+  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, bsize, use_rd);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
-  int current_segment = mi->mbmi.segment_id;
-  int refresh_this_block = candidate_refresh_aq(cpi, mi, bsize, use_rd);
+
   // Check if we should reset the segment_id for this block.
-  if (current_segment && !refresh_this_block)
-    mi->mbmi.segment_id = 0;
+  if (mbmi->segment_id > 0 && !refresh_this_block)
+    mbmi->segment_id = 0;
 
   // Update the cyclic refresh map, to be used for setting segmentation map
   // for the next frame. If the block  will be refreshed this frame, mark it
   // as clean. The magnitude of the -ve influences how long before we consider
   // it for refresh again.
-  if (mi->mbmi.segment_id == 1) {
+  if (mbmi->segment_id == 1) {
     new_map_value = -cr->time_for_refresh;
   } else if (refresh_this_block) {
     // Else if it is accepted as candidate for refresh, and has not already
@@ -121,54 +167,54 @@ void vp9_update_segment_aq(VP9_COMP *const cpi,
     for (x = 0; x < xmis; x++) {
       cr->map[block_index + y * cm->mi_cols + x] = new_map_value;
       cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
-          mi->mbmi.segment_id;
+          mbmi->segment_id;
     }
   // Keep track of actual number (in units of 8x8) of blocks in segment 1 used
   // for encoding this frame.
-  if (mi->mbmi.segment_id)
+  if (mbmi->segment_id)
     cr->num_seg_blocks += xmis * ymis;
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
-void vp9_setup_cyclic_refresh_aq(VP9_COMP *const cpi) {
+void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = &cpi->cyclic_refresh;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  unsigned char *seg_map = cpi->segmentation_map;
-  int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cpi);
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
   // Don't apply refresh on key frame or enhancement layer frames.
   if (!apply_cyclic_refresh ||
-      (cpi->common.frame_type == KEY_FRAME) ||
+      (cm->frame_type == KEY_FRAME) ||
       (cpi->svc.temporal_layer_id > 0)) {
     // Set segmentation map to 0 and disable.
     vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cpi->common.frame_type == KEY_FRAME)
-      cr->mb_index = 0;
+    if (cm->frame_type == KEY_FRAME)
+      cr->sb_index = 0;
     return;
   } else {
     int qindex_delta = 0;
-    int mbs_in_frame = cm->mi_rows * cm->mi_cols;
-    int i, x, y, block_count, bl_index, bl_index2;
-    int sum_map, new_value, mi_row, mi_col, xmis, ymis, qindex2;
+    int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+    int xmis, ymis, x, y, qindex2;
 
     // Rate target ratio to set q delta.
-    float rate_ratio_qdelta = 2.0;
+    const float rate_ratio_qdelta = 2.0;
     vp9_clear_system_state();
     // Some of these parameters may be set via codec-control function later.
-    cr->max_mbs_perframe = 10;
+    cr->max_sbs_perframe = 10;
     cr->max_qdelta_perc = 50;
-    cr->min_block_size = BLOCK_16X16;
+    cr->min_block_size = BLOCK_8X8;
     cr->time_for_refresh = 1;
     // Set rate threshold to some fraction of target (and scaled by 256).
-    cr->thresh_rate_sb = (cpi->rc.sb64_target_rate * 256) >> 2;
+    cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
     // Distortion threshold, quadratic in Q, scale factor to be adjusted.
     cr->thresh_dist_sb = 8 * (int)(vp9_convert_qindex_to_q(cm->base_qindex) *
         vp9_convert_qindex_to_q(cm->base_qindex));
     if (cpi->sf.use_nonrd_pick_mode) {
       // May want to be more conservative with thresholds in non-rd mode for now
       // as rate/distortion are derived from model based on prediction residual.
-      cr->thresh_rate_sb = (cpi->rc.sb64_target_rate * 256) >> 3;
+      cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3;
       cr->thresh_dist_sb = 4 * (int)(vp9_convert_qindex_to_q(cm->base_qindex) *
           vp9_convert_qindex_to_q(cm->base_qindex));
     }
@@ -195,73 +241,84 @@ void vp9_setup_cyclic_refresh_aq(VP9_COMP *const cpi) {
     vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
     // Set the q delta for segment 1.
-    qindex_delta = vp9_compute_qdelta_by_rate(cpi,
+    qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
                                               cm->base_qindex,
                                               rate_ratio_qdelta);
     // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
     // previous encoded frame.
-    if ((-qindex_delta) > cr->max_qdelta_perc * cm->base_qindex / 100) {
+    if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
       qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100;
-    }
 
     // Compute rd-mult for segment 1.
     qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
     cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
 
     vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
-    // Number of target macroblocks to get the q delta (segment 1).
-    block_count = cr->max_mbs_perframe * mbs_in_frame / 100;
-    // Set the segmentation map: cycle through the macroblocks, starting at
+
+    sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    sbs_in_frame = sb_cols * sb_rows;
+    // Number of target superblocks to get the q delta (segment 1).
+    block_count = cr->max_sbs_perframe * sbs_in_frame / 100;
+    // Set the segmentation map: cycle through the superblocks, starting at
-    // cr->mb_index, and stopping when either block_count blocks have been found
+    // cr->sb_index, and stopping when either block_count blocks have been found
     // to be refreshed, or we have passed through whole frame.
-    // Note the setting of seg_map below is done in two steps (one over 8x8)
-    // and then another over SB, in order to keep the value constant over SB.
-    // TODO(marpan): Do this in one pass in SB order.
-    assert(cr->mb_index < mbs_in_frame);
-    i = cr->mb_index;
+    assert(cr->sb_index < sbs_in_frame);
+    i = cr->sb_index;
     do {
-      // If the macroblock is as a candidate for clean up then mark it
-      // for possible boost/refresh (segment 1). The segment id may get reset to
-      // 0 later if the macroblock gets coded anything other than ZEROMV.
-      if (cr->map[i] == 0) {
-        seg_map[i] = 1;
-        block_count--;
-      } else if (cr->map[i] < 0) {
-        cr->map[i]++;
+      int sum_map = 0;
+      // Get the mi_row/mi_col corresponding to superblock index i.
+      int sb_row_index = (i / sb_cols);
+      int sb_col_index = i - sb_row_index * sb_cols;
+      int mi_row = sb_row_index * MI_BLOCK_SIZE;
+      int mi_col = sb_col_index * MI_BLOCK_SIZE;
+      assert(mi_row >= 0 && mi_row < cm->mi_rows);
+      assert(mi_col >= 0 && mi_col < cm->mi_cols);
+      bl_index = mi_row * cm->mi_cols + mi_col;
+      // Loop through all 8x8 blocks in superblock and update map.
+      xmis = MIN(cm->mi_cols - mi_col,
+                 num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+      ymis = MIN(cm->mi_rows - mi_row,
+                 num_8x8_blocks_high_lookup[BLOCK_64X64]);
+      for (y = 0; y < ymis; y++) {
+        for (x = 0; x < xmis; x++) {
+          const int bl_index2 = bl_index + y * cm->mi_cols + x;
+          // If the block is a candidate for clean up then mark it
+          // for possible boost/refresh (segment 1). The segment id may get
+          // reset to 0 later if block gets coded anything other than ZEROMV.
+          if (cr->map[bl_index2] == 0) {
+            seg_map[bl_index2] = 1;
+            sum_map++;
+          } else if (cr->map[bl_index2] < 0) {
+            cr->map[bl_index2]++;
+          }
+        }
+      }
+      // Enforce constant segment over superblock.
+      // If segment is partial over superblock, reset to either all 1 or 0.
+      if (sum_map > 0 && sum_map < xmis * ymis) {
+        const int new_value = (sum_map >= xmis * ymis / 2);
+        for (y = 0; y < ymis; y++)
+          for (x = 0; x < xmis; x++)
+            seg_map[bl_index + y * cm->mi_cols + x] = new_value;
       }
       i++;
-      if (i == mbs_in_frame) {
+      if (i == sbs_in_frame) {
         i = 0;
       }
-    } while (block_count && i != cr->mb_index);
-    cr->mb_index = i;
-    // Enforce constant segment map over superblock.
-    for (mi_row = 0; mi_row < cm->mi_rows; mi_row +=  MI_BLOCK_SIZE)
-      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
-        bl_index = mi_row * cm->mi_cols + mi_col;
-        xmis = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-        ymis = num_8x8_blocks_high_lookup[BLOCK_64X64];
-        xmis = MIN(cm->mi_cols - mi_col, xmis);
-        ymis = MIN(cm->mi_rows - mi_row, ymis);
-        sum_map = 0;
-        for (y = 0; y < ymis; y++)
-          for (x = 0; x < xmis; x++) {
-            bl_index2 = bl_index + y * cm->mi_cols + x;
-               sum_map += seg_map[bl_index2];
-          }
-        new_value = 0;
-        // If segment is partial over superblock, reset.
-        if (sum_map > 0 && sum_map < xmis * ymis) {
-          if (sum_map < xmis * ymis / 2)
-            new_value = 0;
-          else
-            new_value = 1;
-          for (y = 0; y < ymis; y++)
-            for (x = 0; x < xmis; x++) {
-              bl_index2 = bl_index + y * cm->mi_cols + x;
-              seg_map[bl_index2] = new_value;
-            }
-        }
-      }
+      if (sum_map >= xmis * ymis /2)
+        block_count--;
+    } while (block_count && i != cr->sb_index);
+    cr->sb_index = i;
   }
 }
+
+void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
+                                             int64_t rate_sb, int64_t dist_sb) {
+  cr->projected_rate_sb = rate_sb;  // Projected rate of the current SB.
+  cr->projected_dist_sb = dist_sb;  // Compared against cr->thresh_dist_sb.
+}
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  return cr->rdmult;  // Computed for segment 1 in vp9_cyclic_refresh_setup().
+}
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
new file mode 100644
index 0000000..f556d65
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+struct CYCLIC_REFRESH;  // Opaque here; defined in vp9_aq_cyclicrefresh.c.
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);  // Frees cr and cr->map.
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
+                                       MB_MODE_INFO *const mbmi,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, int use_rd);
+
+// Set up cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
+
+void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
+                                             int64_t rate_sb, int64_t dist_sb);
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
diff --git a/libvpx/vp9/encoder/vp9_vaq.c b/libvpx/vp9/encoder/vp9_aq_variance.c
index c71c171..ae2a163 100644
--- a/libvpx/vp9/encoder/vp9_vaq.c
+++ b/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -10,7 +10,7 @@
 
 #include <math.h>
 
-#include "vp9/encoder/vp9_vaq.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
@@ -99,7 +99,7 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) {
         continue;
       }
 
-      qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
+      qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i));
       vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
       vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
 
diff --git a/libvpx/vp9/encoder/vp9_vaq.h b/libvpx/vp9/encoder/vp9_aq_variance.h
index c73114a..381fe50 100644
--- a/libvpx/vp9/encoder/vp9_vaq.h
+++ b/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef VP9_ENCODER_VP9_VAQ_H_
-#define VP9_ENCODER_VP9_VAQ_H_
+#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VP9_ENCODER_VP9_AQ_VARIANCE_H_
 
 #include "vp9/encoder/vp9_onyx_int.h"
 
@@ -31,4 +31,4 @@ int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_VAQ_H_
+#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 1b4a6cc..8d2afb9 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -192,7 +192,7 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
 static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
   const VP9_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
 
@@ -336,7 +336,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   const struct segmentation *const seg = &cm->seg;
   const MODE_INFO *const mi = mi_8x8[0];
-  const MODE_INFO *const above_mi = mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride];
   const MODE_INFO *const left_mi = xd->left_available ? mi_8x8[-1] : NULL;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -375,15 +375,15 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   MODE_INFO *m;
 
-  xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col);
-  m = xd->mi_8x8[0];
+  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  m = xd->mi[0];
 
   set_mi_row_col(xd, tile,
                  mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
                  mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
                  cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cpi, xd->mi_8x8, w);
+    write_mb_modes_kf(cpi, xd->mi, w);
   } else {
     pack_inter_mode_mvs(cpi, m, w);
   }
@@ -392,12 +392,10 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
   pack_mb_tokens(w, tok, tok_end);
 }
 
-static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
+static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            int hbs, int mi_row, int mi_col,
                             PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int ctx = partition_plane_context(cpi->above_seg_context,
-                                          cpi->left_seg_context,
-                                          mi_row, mi_col, bsize);
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   const vp9_prob *const probs = get_partition_probs(cm, ctx);
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
@@ -415,21 +413,24 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
   }
 }
 
-static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
+static void write_modes_sb(VP9_COMP *cpi,
+                           const TileInfo *const tile,
                            vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
   const int bsl = b_width_log2(bsize);
   const int bs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col];
+  MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   partition = partition_lookup[bsl][m->mbmi.sb_type];
-  write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w);
+  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
     write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
@@ -465,29 +466,30 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
+static void write_modes(VP9_COMP *cpi,
+                        const TileInfo *const tile,
                         vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   int mi_row, mi_col;
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-      vp9_zero(cpi->left_seg_context);
+    vp9_zero(cpi->mb.e_mbd.left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+                     BLOCK_64X64);
   }
 }
 
-static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
+static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
+                                    vp9_coeff_stats *coef_branch_ct) {
   vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size];
   vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
   unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
       cpi->common.counts.eob_branch[tx_size];
-  vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
   int i, j, k, l, m;
 
   for (i = 0; i < PLANE_TYPES; ++i) {
@@ -510,16 +512,16 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
 }
 
 static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
-                                     TX_SIZE tx_size) {
+                                     TX_SIZE tx_size,
+                                     vp9_coeff_stats *frame_branch_ct) {
   vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size];
   vp9_coeff_probs_model *old_frame_coef_probs =
       cpi->common.fc.coef_probs[tx_size];
-  vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
   const vp9_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
   switch (cpi->sf.use_fast_coef_updates) {
-    case 0: {
+    case TWO_LOOP: {
       /* dry run to see if there is any udpate at all needed */
       int savings = 0;
       int update[2] = {0, 0};
@@ -594,14 +596,14 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
       return;
     }
 
-    case 1:
-    case 2: {
+    case ONE_LOOP:
+    case ONE_LOOP_REDUCED: {
       const int prev_coef_contexts_to_update =
-          cpi->sf.use_fast_coef_updates == 2 ? COEFF_CONTEXTS >> 1
-                                             : COEFF_CONTEXTS;
+          cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
+              COEFF_CONTEXTS >> 1 : COEFF_CONTEXTS;
       const int coef_band_to_update =
-          cpi->sf.use_fast_coef_updates == 2 ? COEF_BANDS >> 1
-                                             : COEF_BANDS;
+          cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
+              COEF_BANDS >> 1 : COEF_BANDS;
       int updates = 0;
       int noupdates_before_first = 0;
       for (i = 0; i < PLANE_TYPES; ++i) {
@@ -667,13 +669,15 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
+  vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
+
   vp9_clear_system_state();
 
   for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
-    build_tree_distribution(cpi, tx_size);
+    build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size]);
 
   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-    update_coef_probs_common(w, cpi, tx_size);
+    update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size]);
 }
 
 static void encode_loopfilter(struct loopfilter *lf,
@@ -930,7 +934,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
 
-  vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) *
+  vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
              mi_cols_aligned_to_sb(cm->mi_cols));
 
   tok[0][0] = cpi->tok;
@@ -1027,19 +1031,22 @@ static void write_sync_code(struct vp9_write_bit_buffer *wb) {
   vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
 }
 
+static void write_profile(BITSTREAM_PROFILE profile,
+                          struct vp9_write_bit_buffer *wb) {
+  assert(profile < MAX_PROFILES);
+  vp9_wb_write_bit(wb, profile & 1);   // Low bit of the profile goes first.
+  vp9_wb_write_bit(wb, profile >> 1);  // Then the remaining high part.
+}
+
 static void write_uncompressed_header(VP9_COMP *cpi,
                                       struct vp9_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
 
   vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
 
-  // bitstream version.
-  // 00 - profile 0. 4:2:0 only
-  // 10 - profile 1. adds 4:4:4, 4:2:2, alpha
-  vp9_wb_write_bit(wb, cm->version);
-  vp9_wb_write_bit(wb, 0);
+  write_profile(cm->profile, wb);
 
-  vp9_wb_write_bit(wb, 0);
+  vp9_wb_write_bit(wb, 0);  // show_existing_frame
   vp9_wb_write_bit(wb, cm->frame_type);
   vp9_wb_write_bit(wb, cm->show_frame);
   vp9_wb_write_bit(wb, cm->error_resilient_mode);
@@ -1047,16 +1054,20 @@ static void write_uncompressed_header(VP9_COMP *cpi,
   if (cm->frame_type == KEY_FRAME) {
     const COLOR_SPACE cs = UNKNOWN;
     write_sync_code(wb);
+    if (cm->profile > PROFILE_1) {
+      assert(cm->bit_depth > BITS_8);
+      vp9_wb_write_bit(wb, cm->bit_depth - BITS_10);
+    }
     vp9_wb_write_literal(wb, cs, 3);
     if (cs != SRGB) {
       vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
-      if (cm->version == 1) {
+      if (cm->profile >= PROFILE_1) {
         vp9_wb_write_bit(wb, cm->subsampling_x);
         vp9_wb_write_bit(wb, cm->subsampling_y);
         vp9_wb_write_bit(wb, 0);  // has extra plane
       }
     } else {
-      assert(cm->version == 1);
+      assert(cm->profile == PROFILE_1);
       vp9_wb_write_bit(wb, 0);  // has extra plane
     }
 
@@ -1184,7 +1195,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
 
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   uint8_t *data = dest;
-  size_t first_part_size;
+  size_t first_part_size, uncompressed_hdr_size;
   struct vp9_write_bit_buffer wb = {data, 0};
   struct vp9_write_bit_buffer saved_wb;
 
@@ -1192,7 +1203,8 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   saved_wb = wb;
   vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
 
-  data += vp9_rb_bytes_written(&wb);
+  uncompressed_hdr_size = vp9_rb_bytes_written(&wb);
+  data += uncompressed_hdr_size;
 
   vp9_compute_update_table();
 
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index 888984c..7729d84 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -157,7 +157,6 @@ struct macroblock {
 
   // note that token_costs is the cost when eob node is skipped
   vp9_coeff_cost token_costs[TX_SIZES];
-  DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
 
   int optimize;
 
@@ -197,7 +196,8 @@ struct macroblock {
 // TODO(jingning): the variables used here are little complicated. need further
 // refactoring on organizing the temporary buffers, when recursive
 // partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+static INLINE PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
+                                                   BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_64X64:
       return &x->sb64_context;
diff --git a/libvpx/vp9/encoder/vp9_craq.h b/libvpx/vp9/encoder/vp9_craq.h
deleted file mode 100644
index 1f81f3e..0000000
--- a/libvpx/vp9/encoder/vp9_craq.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_CRAQ_H_
-#define VP9_ENCODER_VP9_CRAQ_H_
-
-#include "vp9/encoder/vp9_onyx_int.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Check if we should turn off cyclic refresh based on bitrate condition.
-static int apply_cyclic_refresh_bitrate(VP9_COMP *const cpi);
-
-// Check if this coding block, of size bsize, should be considered for refresh
-// (lower-qp coding).
-static int candidate_refresh_aq(VP9_COMP *const cpi,
-                                MODE_INFO *const mi,
-                                int bsize,
-                                int use_rd);
-
-// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
-// check if we should reset the segment_id, and update the cyclic_refresh map
-// and segmentation map.
-void vp9_update_segment_aq(VP9_COMP *const cpi,
-                           MODE_INFO *const mi,
-                           int mi_row,
-                           int mi_col,
-                           int bsize,
-                           int use_rd);
-
-// Setup cyclic background refresh: set delta q and segmentation map.
-void vp9_setup_cyclic_refresh_aq(VP9_COMP *const cpi);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_CRAQ_H_
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 2f6c33d..61a5022 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -30,6 +30,9 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -38,8 +41,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_vaq.h"
-#include "vp9/encoder/vp9_craq.h"
 
 #define GF_ZEROMV_ZBIN_BOOST 0
 #define LF_ZEROMV_ZBIN_BOOST 0
@@ -162,15 +163,14 @@ static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
                                         MACROBLOCKD *const xd,
                                         int mi_row,
                                         int mi_col) {
-  const int idx_str = xd->mode_info_stride * mi_row + mi_col;
-  xd->mi_8x8 = cm->mi_grid_visible + idx_str;
-  xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
-  xd->mi_8x8[0] = cm->mi + idx_str;
+  const int idx_str = xd->mi_stride * mi_row + mi_col;
+  xd->mi = cm->mi_grid_visible + idx_str;
+  xd->mi[0] = cm->mi + idx_str;
 }
 
-static int is_block_in_mb_map(VP9_COMP *cpi, int mi_row, int mi_col,
+static int is_block_in_mb_map(const VP9_COMP *cpi, int mi_row, int mi_col,
                               BLOCK_SIZE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
+  const VP9_COMMON *const cm = &cpi->common;
   const int mb_rows = cm->mb_rows;
   const int mb_cols = cm->mb_cols;
   const int mb_row = mi_row >> 1;
@@ -194,6 +194,16 @@ static int is_block_in_mb_map(VP9_COMP *cpi, int mi_row, int mi_col,
   return 0;
 }
 
+static int check_active_map(const VP9_COMP *cpi, const MACROBLOCK *x,
+                            int mi_row, int mi_col,
+                            BLOCK_SIZE bsize) {
+  if (cpi->active_map_enabled && !x->e_mbd.lossless) {
+    return is_block_in_mb_map(cpi, mi_row, mi_col, bsize);
+  } else {  // Active map off or lossless coding: block counts as active.
+    return 1;
+  }
+}
+
 static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                         int mi_row, int mi_col, BLOCK_SIZE bsize) {
   MACROBLOCK *const x = &cpi->mb;
@@ -207,20 +217,15 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   const struct segmentation *const seg = &cm->seg;
 
-  set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col);
+  set_skip_context(xd, mi_row, mi_col);
 
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
-
-  if (cpi->active_map_enabled && !x->e_mbd.lossless) {
-    x->in_active_map = is_block_in_mb_map(cpi, mi_row, mi_col, bsize);
-  } else {
-    x->in_active_map = 1;
-  }
+  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
 
   set_modeinfo_offsets(cm, xd, mi_row, mi_col);
 
-  mbmi = &xd->mi_8x8[0]->mbmi;
+  mbmi = &xd->mi[0]->mbmi;
 
   // Set up destination pointers.
   vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
@@ -253,22 +258,6 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
     }
     vp9_init_plane_quantizers(cpi, x);
 
-    if (seg->enabled && cpi->seg0_cnt > 0 &&
-        !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(seg, 1, SEG_LVL_REF_FRAME)) {
-      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-    } else {
-      const int y = mb_row & ~3;
-      const int x = mb_col & ~3;
-      const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
-      const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
-      const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1;
-      const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1;
-
-      cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
-          << 16) / cm->MBs;
-    }
-
     x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
   } else {
     mbmi->segment_id = 0;
@@ -276,19 +265,18 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
-static void duplicate_modeinfo_in_sb(VP9_COMMON * const cm,
+static void duplicate_mode_info_in_sb(VP9_COMMON * const cm,
                                      MACROBLOCKD *const xd,
                                      int mi_row,
                                      int mi_col,
                                      BLOCK_SIZE bsize) {
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  const int mis = xd->mode_info_stride;
   int i, j;
   for (j = 0; j < block_height; ++j)
     for (i = 0; i < block_width; ++i) {
       if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols)
-        xd->mi_8x8[j * mis + i] = xd->mi_8x8[0];
+        xd->mi[j * xd->mi_stride + i] = xd->mi[0];
     }
 }
 
@@ -299,8 +287,8 @@ static void set_block_size(VP9_COMP * const cpi,
   if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
     MACROBLOCKD *const xd = &cpi->mb.e_mbd;
     set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col);
-    xd->mi_8x8[0]->mbmi.sb_type = bsize;
-    duplicate_modeinfo_in_sb(&cpi->common, xd, mi_row, mi_col, bsize);
+    xd->mi[0]->mbmi.sb_type = bsize;
+    duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize);
   }
 }
 
@@ -496,13 +484,13 @@ static void choose_partitioning(VP9_COMP *cpi,
   if (cm->frame_type != KEY_FRAME) {
     vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
 
-    xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
-    xd->mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
+    xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+    xd->mi[0]->mbmi.sb_type = BLOCK_64X64;
     vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
-                          xd->mi_8x8[0]->mbmi.ref_mvs[LAST_FRAME],
+                          xd->mi[0]->mbmi.ref_mvs[LAST_FRAME],
                           &nearest_mv, &near_mv);
 
-    xd->mi_8x8[0]->mbmi.mv[0] = nearest_mv;
+    xd->mi[0]->mbmi.mv[0] = nearest_mv;
     vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
 
     d = xd->plane[0].dst.buf;
@@ -829,52 +817,6 @@ static void activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
   adjust_act_zbin(cpi, x);
 }
 
-// Select a segment for the current SB64
-static void select_in_frame_q_segment(VP9_COMP *cpi,
-                                      int mi_row, int mi_col,
-                                      int output_enabled, int projected_rate) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int complexity_metric = 64;
-  int x, y;
-
-  unsigned char segment;
-
-  if (!output_enabled) {
-    segment = 0;
-  } else {
-    // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
-    // It is converted to bits * 256 units
-    const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
-                            (bw * bh);
-
-    if (projected_rate < (target_rate / 4)) {
-      segment = 1;
-    } else {
-      segment = 0;
-    }
-
-    if (target_rate > 0) {
-      complexity_metric =
-        clamp((int)((projected_rate * 64) / target_rate), 16, 255);
-    }
-  }
-
-  // Fill in the entires in the segment map corresponding to this SB64
-  for (y = 0; y < ymis; y++) {
-    for (x = 0; x < xmis; x++) {
-      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
-      cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
-        (unsigned char)complexity_metric;
-    }
-  }
-}
-
 static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                          int mi_row, int mi_col, BLOCK_SIZE bsize,
                          int output_enabled) {
@@ -885,32 +827,37 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  MODE_INFO *mi_addr = xd->mi_8x8[0];
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi_addr = xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
 
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   int max_plane;
 
   assert(mi->mbmi.sb_type == bsize);
 
-  // For in frame adaptive Q copy over the chosen segment id into the
-  // mode innfo context for the chosen mode / partition.
-  if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ ||
-      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) &&
-      output_enabled) {
-    // Check for reseting segment_id and update cyclic map.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && seg->enabled) {
-      vp9_update_segment_aq(cpi, xd->mi_8x8[0], mi_row, mi_col, bsize, 1);
+  *mi_addr = *mi;
+
+  // If segmentation in use
+  if (seg->enabled && output_enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi_addr->mbmi.segment_id =
+        vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    // Else for cyclic refresh mode update the segment map, set the segment id
+    // and then update the quantizer.
+    else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
+                                        mi_row, mi_col, bsize, 1);
       vp9_init_plane_quantizers(cpi, x);
     }
-    mi->mbmi.segment_id = xd->mi_8x8[0]->mbmi.segment_id;
   }
 
-  *mi_addr = *mi;
-
   max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
   for (i = 0; i < max_plane; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
@@ -932,7 +879,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     for (x_idx = 0; x_idx < mi_width; x_idx++)
       if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
         && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
-        xd->mi_8x8[x_idx + y * mis] = mi_addr;
+        xd->mi[x_idx + y * mis] = mi_addr;
       }
 
   if (cpi->oxcf.aq_mode)
@@ -1051,7 +998,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   }
 
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  mbmi = &xd->mi_8x8[0]->mbmi;
+  mbmi = &xd->mi[0]->mbmi;
   mbmi->sb_type = bsize;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -1101,12 +1048,12 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
                         (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
     if (!is_edge && (complexity > 128))
       x->rdmult += ((x->rdmult * (complexity - 128)) / 256);
-  } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
     const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
-        : cm->last_frame_seg_map;
+                                                  : cm->last_frame_seg_map;
     // If segment 1, use rdmult for that segment.
     if (vp9_get_segment_id(cm, map, bsize, mi_row, mi_col))
-      x->rdmult = cpi->cyclic_refresh.rdmult;
+      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
   }
 
   // Find best coding mode & reconstruct the MB so it is available
@@ -1129,8 +1076,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
       vp9_clear_system_state();
       *totalrate = (int)round(*totalrate * rdmult_ratio);
     }
-  } else if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) ||
-      (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)) {
+  } else if (aq_mode == COMPLEXITY_AQ || aq_mode == CYCLIC_REFRESH_AQ) {
     x->rdmult = orig_rdmult;
   }
 }
@@ -1139,7 +1085,7 @@ static void update_stats(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const MACROBLOCK *const x = &cpi->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const MODE_INFO *const mi = xd->mi_8x8[0];
+  const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (!frame_is_intra_only(cm)) {
@@ -1206,21 +1152,21 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
   int mi_height = num_8x8_blocks_high_lookup[bsize];
   for (p = 0; p < MAX_MB_PLANE; p++) {
     vpx_memcpy(
-        cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+        xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
         a + num_4x4_blocks_wide * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     vpx_memcpy(
-        cpi->left_context[p]
+        xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         l + num_4x4_blocks_high * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  vpx_memcpy(cpi->above_seg_context + mi_col, sa,
-             sizeof(*cpi->above_seg_context) * mi_width);
-  vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl,
-             sizeof(cpi->left_seg_context[0]) * mi_height);
+  vpx_memcpy(xd->above_seg_context + mi_col, sa,
+             sizeof(*xd->above_seg_context) * mi_width);
+  vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+             sizeof(xd->left_seg_context[0]) * mi_height);
 }
 static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
@@ -1239,20 +1185,20 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
   for (p = 0; p < MAX_MB_PLANE; ++p) {
     vpx_memcpy(
         a + num_4x4_blocks_wide * p,
-        cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+        xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     vpx_memcpy(
         l + num_4x4_blocks_high * p,
-        cpi->left_context[p]
+        xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  vpx_memcpy(sa, cpi->above_seg_context + mi_col,
-             sizeof(*cpi->above_seg_context) * mi_width);
-  vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK),
-             sizeof(cpi->left_seg_context[0]) * mi_height);
+  vpx_memcpy(sa, xd->above_seg_context + mi_col,
+             sizeof(*xd->above_seg_context) * mi_width);
+  vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+             sizeof(xd->left_seg_context[0]) * mi_height);
 }
 
 static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
@@ -1284,6 +1230,8 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
                       int output_enabled, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
   const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
   int ctx;
   PARTITION_TYPE partition;
@@ -1293,8 +1241,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
     return;
 
   if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                                 mi_row, mi_col, bsize);
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
     subsize = *get_sb_partitioning(x, bsize);
   } else {
     ctx = 0;
@@ -1349,8 +1296,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 // Check to see if the given partition size is allowed for a specified number
@@ -1382,7 +1328,7 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
                                    MODE_INFO **mi_8x8, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   int row8x8_remaining = tile->mi_row_end - mi_row;
   int col8x8_remaining = tile->mi_col_end - mi_col;
   int block_row, block_col;
@@ -1418,15 +1364,79 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
+static void constrain_copy_partitioning(VP9_COMP *const cpi,
+                                        const TileInfo *const tile,
+                                        MODE_INFO **mi_8x8,
+                                        MODE_INFO **prev_mi_8x8,
+                                        int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  int block_row, block_col;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // If the SB64 is entirely "in image" (a full 64x64 fits inside the tile).
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+        const int index = block_row * mis + block_col;
+        MODE_INFO *prev_mi = prev_mi_8x8[index];
+        const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
+        // Use previous partition if block size is not larger than bsize.
+        if (prev_mi && sb_type <= bsize) {
+          int block_row2, block_col2;
+          for (block_row2 = 0; block_row2 < bh; ++block_row2) {
+            for (block_col2 = 0; block_col2 < bw; ++block_col2) {
+              const int index2 = (block_row + block_row2) * mis +
+                  block_col + block_col2;
+              prev_mi = prev_mi_8x8[index2];
+              if (prev_mi) {
+                const ptrdiff_t offset = prev_mi - cm->prev_mi;
+                mi_8x8[index2] = cm->mi + offset;  // Same slot in cm->mi.
+                mi_8x8[index2]->mbmi.sb_type = prev_mi->mbmi.sb_type;
+              }
+            }
+          }
+        } else {
+          // Otherwise, use fixed partition of size bsize.
+          mi_8x8[index] = mi_upper_left + index;
+          mi_8x8[index]->mbmi.sb_type = bsize;
+        }
+      }
+    }
+  } else {
+    // Else this is a partial SB64, copy previous partition.
+    for (block_row = 0; block_row < 8; ++block_row) {
+      for (block_col = 0; block_col < 8; ++block_col) {
+        MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col];
+        const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
+        if (prev_mi) {  // Entries with no previous mode info are left as-is.
+          const ptrdiff_t offset = prev_mi - cm->prev_mi;
+          mi_8x8[block_row * mis + block_col] = cm->mi + offset;
+          mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type;
+        }
+      }
+    }
+  }
+}
+
 static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8,
                               MODE_INFO **prev_mi_8x8) {
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   int block_row, block_col;
 
   for (block_row = 0; block_row < 8; ++block_row) {
     for (block_col = 0; block_col < 8; ++block_col) {
       MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col];
       const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
+
       if (prev_mi) {
         const ptrdiff_t offset = prev_mi - cm->prev_mi;
         mi_8x8[block_row * mis + block_col] = cm->mi + offset;
@@ -1436,8 +1446,127 @@ static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8,
   }
 }
 
+static const struct {
+  int row;
+  int col;
+} coord_lookup[16] = {  // 16x16 block origins inside an SB64, in MI units.
+    // 32x32 index = 0
+    {0, 0}, {0, 2}, {2, 0}, {2, 2},
+    // 32x32 index = 1
+    {0, 4}, {0, 6}, {2, 4}, {2, 6},
+    // 32x32 index = 2
+    {4, 0}, {4, 2}, {6, 0}, {6, 2},
+    // 32x32 index = 3
+    {4, 4}, {4, 6}, {6, 4}, {6, 6},
+};
+
+static void set_source_var_based_partition(VP9_COMP *cpi,
+                                           const TileInfo *const tile,
+                                           MODE_INFO **mi_8x8,
+                                           int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  const int mis = cm->mi_stride;
+  int row8x8_remaining = tile->mi_row_end - mi_row;
+  int col8x8_remaining = tile->mi_col_end - mi_col;
+  int r, c;
+  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // In-image SB64: a full MI_BLOCK_SIZE square remains inside the tile.
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    const int src_stride = x->plane[0].src.stride;
+    const int pre_stride = cpi->Last_Source->y_stride;
+    const uint8_t *src = x->plane[0].src.buf;
+    const int pre_offset = (mi_row * MI_SIZE) * pre_stride +
+                           (mi_col * MI_SIZE);
+    const uint8_t *pre_src = cpi->Last_Source->y_buffer + pre_offset;
+    const int thr_32x32 = cpi->sf.source_var_thresh;
+    const int thr_64x64 = thr_32x32 << 1;
+    int i, j;
+    int index;
+    diff d32[4];
+    int use16x16 = 0;  // Set if any 32x32 quadrant keeps its 16x16 split.
+
+    for (i = 0; i < 4; i++) {  // i: 32x32 quadrant of the SB64.
+      diff d16[4];
+
+      for (j = 0; j < 4; j++) {  // j: 16x16 block inside quadrant i.
+        int b_mi_row = coord_lookup[i * 4 + j].row;
+        int b_mi_col = coord_lookup[i * 4 + j].col;
+        int b_offset = b_mi_row * MI_SIZE * src_stride +
+                       b_mi_col * MI_SIZE;
+
+        vp9_get_sse_sum_16x16(src + b_offset,
+                              src_stride,
+                              pre_src + b_offset,
+                              pre_stride, &d16[j].sse, &d16[j].sum);
+
+        d16[j].var = d16[j].sse -
+            (((uint32_t)d16[j].sum * d16[j].sum) >> 8);  // sum^2 / 256
+
+        index = b_mi_row * mis + b_mi_col;
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->mbmi.sb_type = BLOCK_16X16;
+
+        // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
+        // size to further improve quality.
+      }
+
+      if (d16[0].var < thr_32x32 && d16[1].var < thr_32x32 &&
+          d16[2].var < thr_32x32 && d16[3].var < thr_32x32) {
+        d32[i].sse = d16[0].sse;
+        d32[i].sum = d16[0].sum;
+
+        for (j = 1; j < 4; j++) {
+          d32[i].sse += d16[j].sse;
+          d32[i].sum += d16[j].sum;
+        }
+
+        d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10);
+
+        index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->mbmi.sb_type = BLOCK_32X32;
+
+        if (!((cm->current_video_frame - 1) %
+            cpi->sf.search_type_check_frequency))
+          cpi->use_large_partition_rate += 1;
+      } else {
+        use16x16 = 1;
+      }
+    }
+
+    if (!use16x16) {  // All four d32[] entries were filled in above.
+      if (d32[0].var < thr_64x64 && d32[1].var < thr_64x64 &&
+          d32[2].var < thr_64x64 && d32[3].var < thr_64x64)  {
+        mi_8x8[0] = mi_upper_left;
+        mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
+      }
+    }
+  } else {   // Partially in-image SB64: no variance test, fit sizes only.
+    BLOCK_SIZE bsize = BLOCK_16X16;
+    int bh = num_8x8_blocks_high_lookup[bsize];
+    int bw = num_8x8_blocks_wide_lookup[bsize];
+
+    for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
+      for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
+        int index = r * mis + c;
+        // Find a partition size that fits; bh and bw are updated through
+        bsize = find_partition_size(bsize,
+                                    (row8x8_remaining - r),
+                                    (col8x8_remaining - c), &bh, &bw);
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->mbmi.sb_type = bsize;
+      }
+    }
+  }
+}
+
 static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   int block_row, block_col;
 
   if (cm->prev_mi) {
@@ -1455,22 +1584,21 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
   return 0;
 }
 
-static void update_state_rt(VP9_COMP *cpi, const PICK_MODE_CONTEXT *ctx,
+static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                             int mi_row, int mi_col, int bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const struct segmentation *const seg = &cm->seg;
 
-  // TODO(jingning) We might need PICK_MODE_CONTEXT to buffer coding modes
-  // associated with variable block sizes. Otherwise, remove this ctx
-  // from argument list.
-  (void)ctx;
+  *(xd->mi[0]) = ctx->mic;
 
-  // Check for reseting segment_id and update cyclic map.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && seg->enabled) {
-    vp9_update_segment_aq(cpi, xd->mi_8x8[0], mi_row, mi_col, bsize, 1);
+  // For in frame adaptive Q, check for resetting the segment_id and updating
+  // the cyclic refresh map.
+  if ((cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) && seg->enabled) {
+    vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
+                                      mi_row, mi_col, bsize, 1);
     vp9_init_plane_quantizers(cpi, x);
   }
 
@@ -1482,11 +1610,13 @@ static void update_state_rt(VP9_COMP *cpi, const PICK_MODE_CONTEXT *ctx,
       ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter];
     }
   }
+
+  x->skip = ctx->skip;
 }
 
 static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
-                     TOKENEXTRA **tp, int mi_row, int mi_col,
-                     int output_enabled, BLOCK_SIZE bsize) {
+                        TOKENEXTRA **tp, int mi_row, int mi_col,
+                        int output_enabled, BLOCK_SIZE bsize) {
   MACROBLOCK *const x = &cpi->mb;
 
   if (bsize < BLOCK_8X8) {
@@ -1495,6 +1625,7 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
     if (x->ab_index > 0)
       return;
   }
+
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
   update_state_rt(cpi, get_block_context(x, bsize), mi_row, mi_col, bsize);
 
@@ -1510,6 +1641,8 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
                          int output_enabled, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
   const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
   int ctx;
   PARTITION_TYPE partition;
@@ -1520,10 +1653,9 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
 
   if (bsize >= BLOCK_8X8) {
     MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-    const int idx_str = xd->mode_info_stride * mi_row + mi_col;
+    const int idx_str = xd->mi_stride * mi_row + mi_col;
     MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
-    ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                                 mi_row, mi_col, bsize);
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
     subsize = mi_8x8[0]->mbmi.sb_type;
   } else {
     ctx = 0;
@@ -1582,8 +1714,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 static void rd_use_partition(VP9_COMP *cpi,
@@ -1594,12 +1725,10 @@ static void rd_use_partition(VP9_COMP *cpi,
                              int do_recon) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
-  const int mis = cm->mode_info_stride;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mis = cm->mi_stride;
   const int bsl = b_width_log2(bsize);
-  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  const int ms = num_4x4_blocks_wide / 2;
-  const int mh = num_4x4_blocks_high / 2;
+  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
   const int bss = (1 << bsl) / 4;
   int i, pl;
   PARTITION_TYPE partition = PARTITION_NONE;
@@ -1618,10 +1747,14 @@ static void rd_use_partition(VP9_COMP *cpi,
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  int do_partition_search = 1;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
+  assert(num_4x4_blocks_wide_lookup[bsize] ==
+         num_4x4_blocks_high_lookup[bsize]);
+
   partition = partition_lookup[bsl][bs_type];
   subsize = get_subsize(bsize, partition);
 
@@ -1641,9 +1774,22 @@ static void rd_use_partition(VP9_COMP *cpi,
   if (bsize == BLOCK_16X16) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
+  } else {
+    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
   }
 
-  if (cpi->sf.partition_search_type == SEARCH_PARTITION &&
+  if (!x->in_active_map) {
+    do_partition_search = 0;
+    if (mi_row + (mi_step >> 1) < cm->mi_rows &&
+        mi_col + (mi_step >> 1) < cm->mi_cols) {
+      *(get_sb_partitioning(x, bsize)) = bsize;
+      bs_type = mi_8x8[0]->mbmi.sb_type = bsize;
+      subsize = bsize;
+      partition = PARTITION_NONE;
+    }
+  }
+  if (do_partition_search &&
+      cpi->sf.partition_search_type == SEARCH_PARTITION &&
       cpi->sf.adjust_partitioning_from_last_frame) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
@@ -1661,15 +1807,13 @@ static void rd_use_partition(VP9_COMP *cpi,
     // If partition is not none try none unless each of the 4 splits are split
     // even further..
     if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + (ms >> 1) < cm->mi_rows &&
-        mi_col + (ms >> 1) < cm->mi_cols) {
+        mi_row + (mi_step >> 1) < cm->mi_rows &&
+        mi_col + (mi_step >> 1) < cm->mi_cols) {
       *(get_sb_partitioning(x, bsize)) = bsize;
       rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
                        get_block_context(x, bsize), INT64_MAX);
 
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
 
       if (none_rate < INT_MAX) {
         none_rate += x->partition_cost[pl][PARTITION_NONE];
@@ -1694,14 +1838,14 @@ static void rd_use_partition(VP9_COMP *cpi,
                        &last_part_dist, subsize,
                        get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
+          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
         int rt = 0;
         int64_t dt = 0;
         update_state(cpi, get_block_context(x, subsize), mi_row, mi_col,
                      subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt,
+        rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &rt, &dt,
                          subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
@@ -1719,14 +1863,14 @@ static void rd_use_partition(VP9_COMP *cpi,
                        &last_part_dist, subsize,
                        get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
         int rt = 0;
         int64_t dt = 0;
         update_state(cpi, get_block_context(x, subsize), mi_row, mi_col,
                      subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt,
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &rt, &dt,
                          subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
@@ -1742,8 +1886,8 @@ static void rd_use_partition(VP9_COMP *cpi,
       last_part_rate = 0;
       last_part_dist = 0;
       for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * (ms >> 1);
-        int y_idx = (i >> 1) * (ms >> 1);
+        int x_idx = (i & 1) * (mi_step >> 1);
+        int y_idx = (i >> 1) * (mi_step >> 1);
         int jj = i >> 1, ii = i & 0x01;
         int rt;
         int64_t dt;
@@ -1769,18 +1913,20 @@ static void rd_use_partition(VP9_COMP *cpi,
       assert(0);
   }
 
-  pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                               mi_row, mi_col, bsize);
+  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   if (last_part_rate < INT_MAX) {
     last_part_rate += x->partition_cost[pl][partition];
     last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
   }
 
-  if (cpi->sf.adjust_partitioning_from_last_frame
+  if (do_partition_search
+      && cpi->sf.adjust_partitioning_from_last_frame
       && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
-      && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
-      && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
+      && (mi_row + mi_step < cm->mi_rows ||
+          mi_row + (mi_step >> 1) == cm->mi_rows)
+      && (mi_col + mi_step < cm->mi_cols ||
+          mi_col + (mi_step >> 1) == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     chosen_rate = 0;
     chosen_dist = 0;
@@ -1788,8 +1934,8 @@ static void rd_use_partition(VP9_COMP *cpi,
 
     // Split partition.
     for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2);
-      int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2);
+      int x_idx = (i & 1) * (mi_step >> 1);
+      int y_idx = (i >> 1) * (mi_step >> 1);
       int rt = 0;
       int64_t dt = 0;
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -1823,14 +1969,11 @@ static void rd_use_partition(VP9_COMP *cpi,
         encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize);
 
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row + y_idx, mi_col + x_idx,
+      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
       chosen_rate += x->partition_cost[pl][PARTITION_NONE];
     }
-    pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                                 mi_row, mi_col, bsize);
+    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
     if (chosen_rate < INT_MAX) {
       chosen_rate += x->partition_cost[pl][PARTITION_SPLIT];
       chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
@@ -1868,14 +2011,14 @@ static void rd_use_partition(VP9_COMP *cpi,
     // and and if necessary apply a Q delta using segmentation to get
     // closer to the target.
     if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      select_in_frame_q_segment(cpi, mi_row, mi_col,
-                                output_enabled, chosen_rate);
-    }
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      cpi->cyclic_refresh.projected_rate_sb = chosen_rate;
-      cpi->cyclic_refresh.projected_dist_sb = chosen_dist;
+      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
+                                    output_enabled, chosen_rate);
     }
 
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              chosen_rate, chosen_dist);
+
     encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
   }
 
@@ -1923,7 +2066,7 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8,
       *min_block_size = MIN(*min_block_size, sb_type);
       *max_block_size = MAX(*max_block_size, sb_type);
     }
-    index += xd->mode_info_stride;
+    index += xd->mi_stride;
   }
 }
 
@@ -1939,77 +2082,71 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
 // Look at neighboring blocks and set a min and max partition size based on
 // what they chose.
 static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
-                                    int row, int col,
+                                    int mi_row, int mi_col,
                                     BLOCK_SIZE *min_block_size,
                                     BLOCK_SIZE *max_block_size) {
-  VP9_COMMON * const cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MODE_INFO ** mi_8x8 = xd->mi_8x8;
-  MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8;
-
+  MODE_INFO **mi_8x8 = xd->mi;
   const int left_in_image = xd->left_available && mi_8x8[-1];
   const int above_in_image = xd->up_available &&
-                             mi_8x8[-xd->mode_info_stride];
-  MODE_INFO ** above_sb64_mi_8x8;
-  MODE_INFO ** left_sb64_mi_8x8;
+                             mi_8x8[-xd->mi_stride];
+  MODE_INFO **above_sb64_mi_8x8;
+  MODE_INFO **left_sb64_mi_8x8;
 
-  int row8x8_remaining = tile->mi_row_end - row;
-  int col8x8_remaining = tile->mi_col_end - col;
+  int row8x8_remaining = tile->mi_row_end - mi_row;
+  int col8x8_remaining = tile->mi_col_end - mi_col;
   int bh, bw;
-
+  BLOCK_SIZE min_size = BLOCK_4X4;
+  BLOCK_SIZE max_size = BLOCK_64X64;
   // Trap case where we do not have a prediction.
-  if (!left_in_image && !above_in_image &&
-      ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) {
-    *min_block_size = BLOCK_4X4;
-    *max_block_size = BLOCK_64X64;
-  } else {
+  if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
     // Default "min to max" and "max to min"
-    *min_block_size = BLOCK_64X64;
-    *max_block_size = BLOCK_4X4;
+    min_size = BLOCK_64X64;
+    max_size = BLOCK_4X4;
 
     // NOTE: each call to get_sb_partition_size_range() uses the previous
     // passed in values for min and max as a starting point.
-    //
     // Find the min and max partition used in previous frame at this location
-    if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) {
-      get_sb_partition_size_range(cpi, prev_mi_8x8,
-                                  min_block_size, max_block_size);
+    if (cm->frame_type != KEY_FRAME) {
+      MODE_INFO **const prev_mi =
+          &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+      get_sb_partition_size_range(cpi, prev_mi, &min_size, &max_size);
     }
-
     // Find the min and max partition sizes used in the left SB64
     if (left_in_image) {
       left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
       get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
-                                  min_block_size, max_block_size);
+                                  &min_size, &max_size);
     }
-
     // Find the min and max partition sizes used in the above SB64.
     if (above_in_image) {
-      above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
+      above_sb64_mi_8x8 = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
       get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
-                                  min_block_size, max_block_size);
+                                  &min_size, &max_size);
+    }
+    // adjust observed min and max
+    if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+      min_size = min_partition_size[min_size];
+      max_size = max_partition_size[max_size];
     }
   }
 
-  // adjust observed min and max
-  if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
-    *min_block_size = min_partition_size[*min_block_size];
-    *max_block_size = max_partition_size[*max_block_size];
-  }
-
-  // Check border cases where max and min from neighbours may not be legal.
-  *max_block_size = find_partition_size(*max_block_size,
-                                        row8x8_remaining, col8x8_remaining,
-                                        &bh, &bw);
-  *min_block_size = MIN(*min_block_size, *max_block_size);
+  // Check border cases where max and min from neighbors may not be legal.
+  max_size = find_partition_size(max_size,
+                                 row8x8_remaining, col8x8_remaining,
+                                 &bh, &bw);
+  min_size = MIN(min_size, max_size);
 
   // When use_square_partition_only is true, make sure at least one square
   // partition is allowed by selecting the next smaller square size as
   // *min_block_size.
   if (cpi->sf.use_square_partition_only &&
-      next_square_size[*max_block_size] < *min_block_size) {
-    *min_block_size = next_square_size[*max_block_size];
+      next_square_size[max_size] < min_size) {
+     min_size = next_square_size[max_size];
   }
+  *min_block_size = min_size;
+  *max_block_size = max_size;
 }
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
@@ -2029,7 +2166,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                               int64_t *dist, int do_recon, int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
-  const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
@@ -2042,8 +2180,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   // Override skipping rectangular partition operations for edge blocks
-  const int force_horz_split = (mi_row + ms >= cm->mi_rows);
-  const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+  const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
   const int xss = x->e_mbd.plane[1].subsampling_x;
   const int yss = x->e_mbd.plane[1].subsampling_y;
 
@@ -2069,6 +2207,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   if (bsize == BLOCK_16X16) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
+  } else {
+    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
   }
 
   // Determine partition types in search according to the speed features.
@@ -2110,9 +2250,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                      ctx, best_rd);
     if (this_rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(cpi->above_seg_context,
-                                     cpi->left_seg_context,
-                                     mi_row, mi_col, bsize);
+        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rate += x->partition_cost[pl][PARTITION_NONE];
       }
       sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
@@ -2157,8 +2295,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   if (do_split) {
     subsize = get_subsize(bsize, PARTITION_SPLIT);
     for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
-      const int x_idx = (i & 1) * ms;
-      const int y_idx = (i >> 1) * ms;
+      const int x_idx = (i & 1) * mi_step;
+      const int y_idx = (i >> 1) * mi_step;
 
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
@@ -2182,9 +2320,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       }
     }
     if (sum_rd < best_rd && i == 4) {
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
@@ -2216,7 +2352,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                      get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
-    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+    if (sum_rd < best_rd && mi_row + mi_step < cm->mi_rows) {
       update_state(cpi, get_block_context(x, subsize), mi_row, mi_col,
                    subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
@@ -2228,7 +2364,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         get_block_context(x, subsize)->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
+      rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rate,
                        &this_dist, subsize, get_block_context(x, subsize),
                        best_rd - sum_rd);
       if (this_rate == INT_MAX) {
@@ -2240,9 +2376,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       }
     }
     if (sum_rd < best_rd) {
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_HORZ];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
@@ -2269,7 +2403,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
     rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
                      get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+    if (sum_rd < best_rd && mi_col + mi_step < cm->mi_cols) {
       update_state(cpi, get_block_context(x, subsize), mi_row, mi_col,
                    subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
@@ -2281,7 +2415,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         get_block_context(x, subsize)->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rate,
                        &this_dist, subsize, get_block_context(x, subsize),
                        best_rd - sum_rd);
       if (this_rate == INT_MAX) {
@@ -2293,9 +2427,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       }
     }
     if (sum_rd < best_rd) {
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_VERT];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
@@ -2323,13 +2455,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
     // and and if necessary apply a Q delta using segmentation to get
     // closer to the target.
     if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate);
-    }
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      cpi->cyclic_refresh.projected_rate_sb = best_rate;
-      cpi->cyclic_refresh.projected_dist_sb = best_dist;
+      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
+                                    best_rate);
     }
 
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              best_rate, best_dist);
+
     encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
   }
   if (bsize == BLOCK_64X64) {
@@ -2344,11 +2477,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                              int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
 
   // Initialize the left context for the new SB row
-  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
+  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
   for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
@@ -2359,7 +2494,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
     BLOCK_SIZE i;
     MACROBLOCK *x = &cpi->mb;
 
-    if (cpi->sf.adaptive_pred_interp_filter) {
+    if (sf->adaptive_pred_interp_filter) {
       for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) {
         const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
         const int num_4x4_h = num_4x4_blocks_high_lookup[i];
@@ -2373,63 +2508,69 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 
     vp9_zero(cpi->mb.pred_mv);
 
-    if ((cpi->sf.partition_search_type == SEARCH_PARTITION &&
-         cpi->sf.use_lastframe_partitioning) ||
-        cpi->sf.partition_search_type == FIXED_PARTITION ||
-        cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION) {
-      const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+    if ((sf->partition_search_type == SEARCH_PARTITION &&
+         sf->use_lastframe_partitioning) ||
+         sf->partition_search_type == FIXED_PARTITION ||
+         sf->partition_search_type == VAR_BASED_PARTITION ||
+         sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
+      const int idx_str = cm->mi_stride * mi_row + mi_col;
       MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
       MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
       cpi->mb.source_variance = UINT_MAX;
-      if (cpi->sf.partition_search_type == FIXED_PARTITION) {
+      if (sf->partition_search_type == FIXED_PARTITION) {
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
         set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                               cpi->sf.always_this_block_size);
+                               sf->always_this_block_size);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
-      } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION) {
+      } else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
         BLOCK_SIZE bsize;
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
         bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
         set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
-      } else if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+      } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
         choose_partitioning(cpi, tile, mi_row, mi_col);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
       } else {
         if ((cm->current_video_frame
-            % cpi->sf.last_partitioning_redo_frequency) == 0
+            % sf->last_partitioning_redo_frequency) == 0
             || cm->prev_mi == 0
             || cm->show_frame == 0
             || cm->frame_type == KEY_FRAME
             || cpi->rc.is_src_frame_alt_ref
-            || ((cpi->sf.use_lastframe_partitioning ==
+            || ((sf->use_lastframe_partitioning ==
                  LAST_FRAME_PARTITION_LOW_MOTION) &&
                  sb_has_motion(cm, prev_mi_8x8))) {
           // If required set upper and lower partition size limits
-          if (cpi->sf.auto_min_max_partition_size) {
+          if (sf->auto_min_max_partition_size) {
             set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
             rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                    &cpi->sf.min_partition_size,
-                                    &cpi->sf.max_partition_size);
+                                    &sf->min_partition_size,
+                                    &sf->max_partition_size);
           }
           rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
                             &dummy_rate, &dummy_dist, 1, INT64_MAX);
         } else {
-          copy_partitioning(cm, mi_8x8, prev_mi_8x8);
+          if (sf->constrain_copy_partition &&
+              sb_has_motion(cm, prev_mi_8x8))
+            constrain_copy_partitioning(cpi, tile, mi_8x8, prev_mi_8x8,
+                                        mi_row, mi_col, BLOCK_16X16);
+          else
+            copy_partitioning(cm, mi_8x8, prev_mi_8x8);
           rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                            &dummy_rate, &dummy_dist, 1);
         }
       }
     } else {
       // If required set upper and lower partition size limits
-      if (cpi->sf.auto_min_max_partition_size) {
+      if (sf->auto_min_max_partition_size) {
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
         rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                &cpi->sf.min_partition_size,
-                                &cpi->sf.max_partition_size);
+                                &sf->min_partition_size,
+                                &sf->max_partition_size);
       }
       rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
                         &dummy_rate, &dummy_dist, 1, INT64_MAX);
@@ -2444,9 +2585,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
 
   x->act_zbin_adj = 0;
-  cpi->seg0_idx = 0;
-
-  xd->mode_info_stride = cm->mode_info_stride;
 
   // Copy data over into macro block data structures.
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
@@ -2458,27 +2596,16 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
-  xd->mi_8x8[0]->mbmi.mode = DC_PRED;
-  xd->mi_8x8[0]->mbmi.uv_mode = DC_PRED;
-
-  vp9_zero(cm->counts.y_mode);
-  vp9_zero(cm->counts.uv_mode);
-  vp9_zero(cm->counts.inter_mode);
-  vp9_zero(cm->counts.partition);
-  vp9_zero(cm->counts.intra_inter);
-  vp9_zero(cm->counts.comp_inter);
-  vp9_zero(cm->counts.single_ref);
-  vp9_zero(cm->counts.comp_ref);
-  vp9_zero(cm->counts.tx);
-  vp9_zero(cm->counts.skip);
+  xd->mi[0]->mbmi.mode = DC_PRED;
+  xd->mi[0]->mbmi.uv_mode = DC_PRED;
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(cpi->above_context[0], 0,
-             sizeof(*cpi->above_context[0]) *
+  vpx_memset(xd->above_context[0], 0,
+             sizeof(*xd->above_context[0]) *
              2 * aligned_mi_cols * MAX_MB_PLANE);
-  vpx_memset(cpi->above_seg_context, 0,
-             sizeof(*cpi->above_seg_context) * aligned_mi_cols);
+  vpx_memset(xd->above_seg_context, 0,
+             sizeof(*xd->above_seg_context) * aligned_mi_cols);
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
@@ -2508,100 +2635,15 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
   }
 }
 
-static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) {
-  int x, y;
-
-  for (y = 0; y < ymbs; y++) {
-    for (x = 0; x < xmbs; x++) {
-      if (!mi_8x8[y * mis + x]->mbmi.skip)
-        return 0;
-    }
-  }
-
-  return 1;
-}
-
-static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs,
-                          TX_SIZE tx_size) {
-  int x, y;
-
-  for (y = 0; y < ymbs; y++) {
-    for (x = 0; x < xmbs; x++)
-      mi_8x8[y * mis + x]->mbmi.tx_size = tx_size;
-  }
-}
-
-static void reset_skip_txfm_size_b(const VP9_COMMON *cm, int mis,
-                                   TX_SIZE max_tx_size, int bw, int bh,
-                                   int mi_row, int mi_col,
-                                   MODE_INFO **mi_8x8) {
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
-    return;
-  } else {
-    const MB_MODE_INFO *const mbmi = &mi_8x8[0]->mbmi;
-    if (mbmi->tx_size > max_tx_size) {
-      const int ymbs = MIN(bh, cm->mi_rows - mi_row);
-      const int xmbs = MIN(bw, cm->mi_cols - mi_col);
-
-      assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
-             get_skip_flag(mi_8x8, mis, ymbs, xmbs));
-      set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
-    }
-  }
-}
-
-static void reset_skip_txfm_size_sb(VP9_COMMON *cm, MODE_INFO **mi_8x8,
-                                    TX_SIZE max_tx_size, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize) {
-  const int mis = cm->mode_info_stride;
-  int bw, bh;
-  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
-  bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
-
-  if (bw == bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, bs, mi_row, mi_col,
-                           mi_8x8);
-  } else if (bw == bs && bh < bs) {
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row, mi_col,
-                           mi_8x8);
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row + hbs,
-                           mi_col, mi_8x8 + hbs * mis);
-  } else if (bw < bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row, mi_col,
-                           mi_8x8);
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col + hbs, mi_8x8 + hbs);
-  } else {
-    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
-    int n;
-
-    assert(bw < bs && bh < bs);
-
-    for (n = 0; n < 4; n++) {
-      const int mi_dc = hbs * (n & 1);
-      const int mi_dr = hbs * (n >> 1);
-
-      reset_skip_txfm_size_sb(cm, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size,
-                              mi_row + mi_dr, mi_col + mi_dc, subsize);
-    }
-  }
-}
-
 static void reset_skip_txfm_size(VP9_COMMON *cm, TX_SIZE txfm_max) {
   int mi_row, mi_col;
-  const int mis = cm->mode_info_stride;
-  MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible;
+  const int mis = cm->mi_stride;
+  MODE_INFO **mi_ptr = cm->mi_grid_visible;
 
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
-    mi_8x8 = mi_ptr;
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) {
-      reset_skip_txfm_size_sb(cm, mi_8x8, txfm_max, mi_row, mi_col,
-                              BLOCK_64X64);
+  for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
+    for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+      if (mi_ptr[mi_col]->mbmi.tx_size > txfm_max)
+        mi_ptr[mi_col]->mbmi.tx_size = txfm_max;
     }
   }
 }
@@ -2680,16 +2722,347 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  xd->mi_8x8[0]->mbmi.sb_type = bsize;
+  xd->mi[0]->mbmi.sb_type = bsize;
 
   if (!frame_is_intra_only(cm)) {
     vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col,
                         rate, dist, bsize);
   } else {
     MB_PREDICTION_MODE intramode = DC_PRED;
-    set_mode_info(&xd->mi_8x8[0]->mbmi, bsize, intramode);
+    set_mode_info(&xd->mi[0]->mbmi, bsize, intramode);
+  }
+  duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+}
+
+// Copies the mode info saved in the block contexts of the partition tree
+// rooted at (mi_row, mi_col) into the frame's mode info array.
+static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, BLOCK_SIZE subsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition = partition_lookup[bsl][subsize];
+
+  assert(bsize >= BLOCK_8X8);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      *(xd->mi[0]) = get_block_context(x, subsize)->mic;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+      break;
+    case PARTITION_VERT:  // Each half spans subsize, not the parent bsize.
+      *get_sb_index(x, subsize) = 0;
+      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      *(xd->mi[0]) = get_block_context(x, subsize)->mic;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
+      if (mi_col + hbs < cm->mi_cols) {
+        *get_sb_index(x, subsize) = 1;
+        set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs);
+        *(xd->mi[0]) = get_block_context(x, subsize)->mic;
+        duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize);
+      }
+      break;
+    case PARTITION_HORZ:  // Each half spans subsize, not the parent bsize.
+      *get_sb_index(x, subsize) = 0;
+      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      *(xd->mi[0]) = get_block_context(x, subsize)->mic;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
+      if (mi_row + hbs < cm->mi_rows) {
+        *get_sb_index(x, subsize) = 1;
+        set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col);
+        *(xd->mi[0]) = get_block_context(x, subsize)->mic;
+        duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize);
+      }
+      break;
+    case PARTITION_SPLIT:
+      *get_sb_index(x, subsize) = 0;
+      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize,
+                        *(get_sb_partitioning(x, subsize)));
+      *get_sb_index(x, subsize) = 1;
+      fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
+                        *(get_sb_partitioning(x, subsize)));
+      *get_sb_index(x, subsize) = 2;
+      fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
+                        *(get_sb_partitioning(x, subsize)));
+      *get_sb_index(x, subsize) = 3;
+      fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
+                        *(get_sb_partitioning(x, subsize)));
+      break;
+    default:
+      break;
+  }
+}
+
+// Searches PARTITION_NONE, SPLIT, HORZ and VERT for the block at (mi_row,
+// mi_col) using the non-RD mode picker, returns the best rate and distortion
+// in *rate and *dist, and stores the winning partitioning for bsize.
+static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+                                 TOKENEXTRA **tp, int mi_row,
+                                 int mi_col, BLOCK_SIZE bsize, int *rate,
+                                 int64_t *dist, int do_recon, int64_t best_rd) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
+  TOKENEXTRA *tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize);
+  int i;
+  BLOCK_SIZE subsize;
+  int this_rate, sum_rate = 0, best_rate = INT_MAX;
+  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
+  int64_t sum_rd = 0;
+  int do_split = bsize >= BLOCK_8X8;
+  int do_rect = 1;
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+  const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
+
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
+  (void) *tp_orig;
+
+  if (bsize < BLOCK_8X8) {
+    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
+    // there is nothing to be done.
+    if (x->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+  }
+
+  assert(num_8x8_blocks_wide_lookup[bsize] ==
+             num_8x8_blocks_high_lookup[bsize]);
+
+  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
+
+  // Determine partition types in search according to the speed features.
+  // The threshold set here has to be of square block size.
+  if (cpi->sf.auto_min_max_partition_size) {
+    partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
+                               bsize >= cpi->sf.min_partition_size);
+    partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
+                                bsize >  cpi->sf.min_partition_size) ||
+                                force_horz_split);
+    partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
+                                bsize >  cpi->sf.min_partition_size) ||
+                                force_vert_split);
+    do_split &= bsize > cpi->sf.min_partition_size;
+  }
+  if (cpi->sf.use_square_partition_only) {
+    partition_horz_allowed &= force_horz_split;
+    partition_vert_allowed &= force_vert_split;
+  }
+
+  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
+    do_split = 0;
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
+                        &this_rate, &this_dist, bsize);
+    ctx->mic.mbmi = xd->mi[0]->mbmi;
+
+    if (this_rate != INT_MAX) {
+      int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      this_rate += x->partition_cost[pl][PARTITION_NONE];
+      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
+      if (sum_rd < best_rd) {
+        int64_t stop_thresh = 4096;
+        int64_t stop_thresh_rd;
+
+        best_rate = this_rate;
+        best_dist = this_dist;
+        best_rd = sum_rd;
+        if (bsize >= BLOCK_8X8)
+          *(get_sb_partitioning(x, bsize)) = bsize;
+
+        // Adjust threshold according to partition size.
+        stop_thresh >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
+        // If obtained distortion is very small, choose current partition
+        // and stop splitting.
+        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+          do_split = 0;
+          do_rect = 0;
+        }
+      }
+    }
+    if (!x->in_active_map) {
+      do_split = 0;
+      do_rect = 0;
+    }
+  }
+
+  // store estimated motion vector
+  store_pred_mv(x, ctx);
+
+  // PARTITION_SPLIT
+  sum_rd = 0;
+  if (do_split) {
+    int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+      const int x_idx = (i & 1) * ms;
+      const int y_idx = (i >> 1) * ms;
+
+      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+        continue;
+
+      *get_sb_index(x, subsize) = i;
+      load_pred_mv(x, ctx);
+
+      nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
+                           subsize, &this_rate, &this_dist, 0,
+                           best_rd - sum_rd);
+
+      if (this_rate == INT_MAX) {
+        sum_rd = INT64_MAX;
+      } else {
+        sum_rate += this_rate;
+        sum_dist += this_dist;
+        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      }
+    }
+
+    if (sum_rd < best_rd) {
+      best_rate = sum_rate;
+      best_dist = sum_dist;
+      best_rd = sum_rd;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    } else {
+      // skip rectangular partition test when larger block size
+      // gives better rd cost
+      if (cpi->sf.less_rectangular_check)
+        do_rect &= !partition_none_allowed;
+    }
+  }
+
+  // PARTITION_HORZ
+  if (partition_horz_allowed && do_rect) {
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    *get_sb_index(x, subsize) = 0;
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    // Totals restart from the first half instead of reusing the split's sums.
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
+                        &sum_rate, &sum_dist, subsize);
+    get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
+    sum_rd = (sum_rate == INT_MAX) ? INT64_MAX :
+        RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+
+    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+      *get_sb_index(x, subsize) = 1;
+      load_pred_mv(x, ctx);
+
+      nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
+                          &this_rate, &this_dist, subsize);
+
+      get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
+
+      if (this_rate == INT_MAX) {
+        sum_rd = INT64_MAX;
+      } else {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rate += x->partition_cost[pl][PARTITION_HORZ];
+        sum_rate += this_rate;
+        sum_dist += this_dist;
+        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      }
+    }
+    if (sum_rd < best_rd) {
+      best_rd = sum_rd;
+      best_rate = sum_rate;
+      best_dist = sum_dist;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+  }
+
+  // PARTITION_VERT
+  if (partition_vert_allowed && do_rect) {
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    *get_sb_index(x, subsize) = 0;
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    // Totals restart from the first half instead of reusing earlier sums.
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
+                        &sum_rate, &sum_dist, subsize);
+    get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
+    sum_rd = (sum_rate == INT_MAX) ? INT64_MAX :
+        RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+      *get_sb_index(x, subsize) = 1;
+      load_pred_mv(x, ctx);
+
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
+                          &this_rate, &this_dist, subsize);
+
+      get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
+
+      if (this_rate == INT_MAX) {
+        sum_rd = INT64_MAX;
+      } else {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rate += x->partition_cost[pl][PARTITION_VERT];
+        sum_rate += this_rate;
+        sum_dist += this_dist;
+        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      }
+    }
+    if (sum_rd < best_rd) {
+      best_rate = sum_rate;
+      best_dist = sum_dist;
+      best_rd = sum_rd;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+  }
+
+  *rate = best_rate;
+  *dist = best_dist;
+
+  if (best_rate == INT_MAX)
+    return;
+
+  // update mode info array
+  fill_mode_info_sb(cm, x, mi_row, mi_col, bsize,
+                    *(get_sb_partitioning(x, bsize)));
+
+  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and, if necessary, apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
+                                    best_rate);
+    }
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              best_rate, best_dist);
+
+    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+  }
+
+  if (bsize == BLOCK_64X64) {
+    assert(tp_orig < *tp);
+    assert(best_rate < INT_MAX);
+    assert(best_dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
-  duplicate_modeinfo_in_sb(cm, xd, mi_row, mi_col, bsize);
 }
 
 static void nonrd_use_partition(VP9_COMP *cpi,
@@ -2701,35 +3074,34 @@ static void nonrd_use_partition(VP9_COMP *cpi,
                                 int *totrate, int64_t *totdist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  int rate;
-  int64_t dist;
+  int rate = INT_MAX;
+  int64_t dist = INT64_MAX;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (bsize >= BLOCK_8X8) {
-    subsize = mi_8x8[0]->mbmi.sb_type;
-  } else {
-    subsize = BLOCK_4X4;
-  }
-
+  subsize = (bsize >= BLOCK_8X8) ? mi_8x8[0]->mbmi.sb_type : BLOCK_4X4;
   partition = partition_lookup[bsl][subsize];
 
   switch (partition) {
     case PARTITION_NONE:
       nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
+      get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
       break;
     case PARTITION_VERT:
       *get_sb_index(x, subsize) = 0;
       nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
+      get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
       if (mi_col + hbs < cm->mi_cols) {
         *get_sb_index(x, subsize) = 1;
         nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
                             &rate, &dist, subsize);
+        get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
         if (rate != INT_MAX && dist != INT64_MAX &&
             *totrate != INT_MAX && *totdist != INT64_MAX) {
           *totrate += rate;
@@ -2740,10 +3112,12 @@ static void nonrd_use_partition(VP9_COMP *cpi,
     case PARTITION_HORZ:
       *get_sb_index(x, subsize) = 0;
       nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
+      get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
       if (mi_row + hbs < cm->mi_rows) {
         *get_sb_index(x, subsize) = 1;
         nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
                             &rate, &dist, subsize);
+        get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi;
         if (rate != INT_MAX && dist != INT64_MAX &&
             *totrate != INT_MAX && *totdist != INT64_MAX) {
           *totrate += rate;
@@ -2753,7 +3127,6 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       break;
     case PARTITION_SPLIT:
       subsize = get_subsize(bsize, PARTITION_SPLIT);
-
       *get_sb_index(x, subsize) = 0;
       nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
                           subsize, output_enabled, totrate, totdist);
@@ -2790,10 +3163,9 @@ static void nonrd_use_partition(VP9_COMP *cpi,
   }
 
   if (bsize == BLOCK_64X64 && output_enabled) {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      cpi->cyclic_refresh.projected_rate_sb = *totrate;
-      cpi->cyclic_refresh.projected_dist_sb = *totdist;
-    }
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              *totrate, *totdist);
     encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize);
   }
 }
@@ -2801,132 +3173,102 @@ static void nonrd_use_partition(VP9_COMP *cpi,
 static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                                 int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
   int mi_col;
 
   // Initialize the left context for the new SB row
-  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
+  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
   for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
-    int dummy_rate;
-    int64_t dummy_dist;
-    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+    int dummy_rate = 0;
+    int64_t dummy_dist = 0;
+    const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
     MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
-
-    BLOCK_SIZE bsize = cpi->sf.partition_search_type == FIXED_PARTITION ?
-        cpi->sf.always_this_block_size :
-        get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
+    BLOCK_SIZE bsize;
 
     cpi->mb.source_variance = UINT_MAX;
+    vp9_zero(cpi->mb.pred_mv);
 
     // Set the partition type of the 64X64 block
-    if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
-      choose_partitioning(cpi, tile, mi_row, mi_col);
-    else if (cpi->sf.partition_search_type == REFERENCE_PARTITION) {
-      if (cpi->sf.partition_check) {
-        MACROBLOCK *x = &cpi->mb;
-        int rate1, rate2, rate3;
-        int64_t dist1, dist2, dist3;
-        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_8X8);
+    switch (cpi->sf.partition_search_type) {
+      case VAR_BASED_PARTITION:
+        choose_partitioning(cpi, tile, mi_row, mi_col);
         nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                            0, &rate1, &dist1);
-        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_16X16);
+                            1, &dummy_rate, &dummy_dist);
+        break;
+      case SOURCE_VAR_BASED_PARTITION:
+        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+        set_source_var_based_partition(cpi, tile, mi_8x8, mi_row, mi_col);
         nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                            0, &rate2, &dist2);
-        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_32X32);
+                            1, &dummy_rate, &dummy_dist);
+        break;
+      case VAR_BASED_FIXED_PARTITION:
+      case FIXED_PARTITION:
+        bsize = cpi->sf.partition_search_type == FIXED_PARTITION ?
+                cpi->sf.always_this_block_size :
+                get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
+        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                            0, &rate3, &dist3);
-
-        if (RDCOST(x->rdmult, x->rddiv, rate1, dist1) <
-            RDCOST(x->rdmult, x->rddiv, rate2, dist2)) {
-          if (RDCOST(x->rdmult, x->rddiv, rate1, dist1) <
-              RDCOST(x->rdmult, x->rddiv, rate3, dist3))
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_8X8);
-          else
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_32X32);
+                            1, &dummy_rate, &dummy_dist);
+        break;
+      case REFERENCE_PARTITION:
+        if (cpi->sf.partition_check || sb_has_motion(cm, prev_mi_8x8)) {
+          nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
+                               &dummy_rate, &dummy_dist, 1, INT64_MAX);
         } else {
-          if (RDCOST(x->rdmult, x->rddiv, rate2, dist2) <
-              RDCOST(x->rdmult, x->rddiv, rate3, dist3))
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_16X16);
-          else
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_32X32);
-        }
-
-      } else {
-        if (!sb_has_motion(cm, prev_mi_8x8))
           copy_partitioning(cm, mi_8x8, prev_mi_8x8);
-        else
-          set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
-      }
+          nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
+                              BLOCK_64X64, 1, &dummy_rate, &dummy_dist);
+        }
+        break;
+      default:
+        assert(0);
     }
-    else
-      set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
-
-    nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, 1,
-                        &dummy_rate, &dummy_dist);
   }
 }
 // end RTC play code
 
 static void encode_frame_internal(VP9_COMP *cpi) {
-  int mi_row;
+  SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
-//           cpi->common.current_video_frame, cpi->common.show_frame,
-//           cm->frame_type);
-
-  vp9_zero(cm->counts.switchable_interp);
-  vp9_zero(cpi->tx_stepdown_count);
-
-  xd->mi_8x8 = cm->mi_grid_visible;
-  // required for vp9_frame_init_quantizer
-  xd->mi_8x8[0] = cm->mi;
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
 
-  vp9_zero(cm->counts.mv);
+  vp9_zero(cm->counts);
   vp9_zero(cpi->coef_counts);
-  vp9_zero(cm->counts.eob_branch);
+  vp9_zero(cpi->tx_stepdown_count);
+  vp9_zero(cpi->rd_comp_pred_diff);
+  vp9_zero(cpi->rd_filter_diff);
+  vp9_zero(cpi->rd_tx_select_diff);
+  vp9_zero(cpi->rd_tx_select_threshes);
 
-  // Set frame level transform size use case
   cm->tx_mode = select_tx_mode(cpi);
 
-  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
-      && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
+                           cm->y_dc_delta_q == 0 &&
+                           cm->uv_dc_delta_q == 0 &&
+                           cm->uv_ac_delta_q == 0;
   switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
 
   vp9_frame_init_quantizer(cpi);
 
   vp9_initialize_rd_consts(cpi);
   vp9_initialize_me_consts(cpi, cm->base_qindex);
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Initialize encode frame context.
-    init_encode_frame_mb_context(cpi);
-
-    // Build a frame level activity map
-    build_activity_map(cpi);
-  }
-
-  // Re-initialize encode frame context.
   init_encode_frame_mb_context(cpi);
 
-  vp9_zero(cpi->rd_comp_pred_diff);
-  vp9_zero(cpi->rd_filter_diff);
-  vp9_zero(cpi->rd_tx_select_diff);
-  vp9_zero(cpi->rd_tx_select_threshes);
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    build_activity_map(cpi);
 
-  set_prev_mi(cm);
+  cm->prev_mi = get_prev_mi(cm);
 
-  if (cpi->sf.use_nonrd_pick_mode) {
+  if (sf->use_nonrd_pick_mode) {
     // Initialize internal buffer pointers for rtc coding, where non-RD
     // mode decision is used and hence no buffer pointer swap needed.
     int i;
@@ -2941,6 +3283,29 @@ static void encode_frame_internal(VP9_COMP *cpi) {
       p[i].eobs = ctx->eobs_pbuf[i][0];
     }
     vp9_zero(x->zcoeff_blk);
+
+    if (cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION &&
+        cm->current_video_frame > 0) {
+      int check_freq = cpi->sf.search_type_check_frequency;
+
+      if ((cm->current_video_frame - 1) % check_freq == 0) {
+        cpi->use_large_partition_rate = 0;
+      }
+
+      if ((cm->current_video_frame - 1) % check_freq == 1) {
+        const int mbs_in_b32x32 = 1 << ((b_width_log2_lookup[BLOCK_32X32] -
+                                  b_width_log2_lookup[BLOCK_16X16]) +
+                                  (b_height_log2_lookup[BLOCK_32X32] -
+                                  b_height_log2_lookup[BLOCK_16X16]));
+        cpi->use_large_partition_rate = cpi->use_large_partition_rate * 100 *
+                                        mbs_in_b32x32 / cm->MBs;
+      }
+
+      if ((cm->current_video_frame - 1) % check_freq >= 1) {
+        if (cpi->use_large_partition_rate < 15)
+          cpi->sf.partition_search_type = FIXED_PARTITION;
+      }
+    }
   }
 
   {
@@ -2958,12 +3323,13 @@ static void encode_frame_internal(VP9_COMP *cpi) {
         for (tile_col = 0; tile_col < tile_cols; tile_col++) {
           TileInfo tile;
           TOKENEXTRA *tp_old = tp;
+          int mi_row;
 
           // For each row of SBs in the frame
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
-            if (cpi->sf.use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
+            if (sf->use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
               encode_nonrd_sb_row(cpi, &tile, mi_row, &tp);
             else
               encode_rd_sb_row(cpi, &tile, mi_row, &tp);
@@ -2978,18 +3344,18 @@ static void encode_frame_internal(VP9_COMP *cpi) {
     cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
   }
 
-  if (cpi->sf.skip_encode_sb) {
+  if (sf->skip_encode_sb) {
     int j;
     unsigned int intra_count = 0, inter_count = 0;
     for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
       intra_count += cm->counts.intra_inter[j][0];
       inter_count += cm->counts.intra_inter[j][1];
     }
-    cpi->sf.skip_encode_frame = (intra_count << 2) < inter_count &&
-                                cm->frame_type != KEY_FRAME &&
-                                cm->show_frame;
+    sf->skip_encode_frame = (intra_count << 2) < inter_count &&
+                            cm->frame_type != KEY_FRAME &&
+                            cm->show_frame;
   } else {
-    cpi->sf.skip_encode_frame = 0;
+    sf->skip_encode_frame = 0;
   }
 
 #if 0
@@ -3023,33 +3389,31 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
   if (cpi->sf.frame_parameter_update) {
     int i;
-    REFERENCE_MODE reference_mode;
-    /*
-     * This code does a single RD pass over the whole frame assuming
-     * either compound, single or hybrid prediction as per whatever has
-     * worked best for that type of frame in the past.
-     * It also predicts whether another coding mode would have worked
-     * better that this coding mode. If that is the case, it remembers
-     * that for subsequent frames.
-     * It does the same analysis for transform size selection also.
-     */
+
+    // This code does a single RD pass over the whole frame assuming
+    // either compound, single or hybrid prediction as per whatever has
+    // worked best for that type of frame in the past.
+    // It also predicts whether another coding mode would have worked
+    // better than this coding mode. If that is the case, it remembers
+    // that for subsequent frames.
+    // It does the same analysis for transform size selection also.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     const int64_t *mode_thresh = cpi->rd_prediction_type_threshes[frame_type];
     const int64_t *filter_thresh = cpi->rd_filter_threshes[frame_type];
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == 3 || !cm->allow_comp_inter_inter)
-      reference_mode = SINGLE_REFERENCE;
+    if (frame_type == ALTREF_FRAME || !cm->allow_comp_inter_inter)
+      cm->reference_mode = SINGLE_REFERENCE;
     else if (mode_thresh[COMPOUND_REFERENCE] > mode_thresh[SINGLE_REFERENCE] &&
              mode_thresh[COMPOUND_REFERENCE] >
                  mode_thresh[REFERENCE_MODE_SELECT] &&
              check_dual_ref_flags(cpi) &&
              cpi->static_mb_pct == 100)
-      reference_mode = COMPOUND_REFERENCE;
+      cm->reference_mode = COMPOUND_REFERENCE;
     else if (mode_thresh[SINGLE_REFERENCE] > mode_thresh[REFERENCE_MODE_SELECT])
-      reference_mode = SINGLE_REFERENCE;
+      cm->reference_mode = SINGLE_REFERENCE;
     else
-      reference_mode = REFERENCE_MODE_SELECT;
+      cm->reference_mode = REFERENCE_MODE_SELECT;
 
     if (cm->interp_filter == SWITCHABLE) {
       if (frame_type != ALTREF_FRAME &&
@@ -3065,9 +3429,6 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       }
     }
 
-    cpi->mb.e_mbd.lossless = cpi->oxcf.lossless;
-    cm->reference_mode = reference_mode;
-
     encode_frame_internal(cpi);
 
     for (i = 0; i < REFERENCE_MODES; ++i) {
@@ -3146,10 +3507,8 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       }
     }
   } else {
-    cpi->mb.e_mbd.lossless = cpi->oxcf.lossless;
     cm->reference_mode = SINGLE_REFERENCE;
-    // Force the usage of the BILINEAR interp_filter.
-    cm->interp_filter = BILINEAR;
+    cm->interp_filter = SWITCHABLE;
     encode_frame_internal(cpi);
   }
 }
@@ -3214,19 +3573,20 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi_8x8;
+  MODE_INFO **mi_8x8 = xd->mi;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
   PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize);
   unsigned int segment_id = mbmi->segment_id;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
-                   (cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
-                    cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) &&
-                   !cpi->sf.use_nonrd_pick_mode;
+                   cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
+                   cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
+                   cpi->sf.allow_skip_recode;
+
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
@@ -3243,7 +3603,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
     }
   } else {
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       // Adjust the zbin based on this MB rate.
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h
index 72343cd..131e932 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -20,6 +20,12 @@ struct macroblock;
 struct yv12_buffer_config;
 struct VP9_COMP;
 
+typedef struct {
+  unsigned int sse;
+  int sum;
+  unsigned int var;
+} diff;
+
 void vp9_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src,
                           int mi_row, int mi_col);
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index fae03bf..5e98e4e 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -111,7 +111,7 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *p = &mb->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
+  const int ref = is_inter_block(&xd->mi[0]->mbmi);
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
   const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
@@ -139,7 +139,7 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   rdmult = mb->rdmult * err_mult;
-  if (!is_inter_block(&mb->e_mbd.mi_8x8[0]->mbmi))
+  if (!is_inter_block(&mb->e_mbd.mi[0]->mbmi))
     rdmult = (rdmult * 9) >> 4;
   rddiv = mb->rddiv;
   /* Initialize the sentinel node of the trellis. */
@@ -452,7 +452,7 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
   int plane;
 
@@ -477,7 +477,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
@@ -562,7 +562,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
-      mode = plane == 0 ? get_y_mode(xd->mi_8x8[0], block) : mbmi->uv_mode;
+      mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
@@ -608,14 +608,14 @@ void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args arg = {x, NULL, &xd->mi_8x8[0]->mbmi.skip};
+  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
                                          &arg);
 }
 
 int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mi[0]->mbmi;
   x->skip_encode = 0;
   mbmi->mode = DC_PRED;
   mbmi->ref_frame[0] = INTRA_FRAME;
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index 2a10bbf..9d44865 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -242,7 +242,7 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2],
 }
 
 void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
-  const MODE_INFO *mi = xd->mi_8x8[0];
+  const MODE_INFO *mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (mbmi->sb_type < BLOCK_8X8) {
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index c4c219b..db32ef8 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_reconinter.h"  // vp9_setup_dst_planes()
 #include "vp9/common/vp9_systemdependent.h"
 
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
@@ -34,7 +35,6 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_vaq.h"
 #include "vp9/encoder/vp9_variance.h"
 
 #define OUTPUT_FPF 0
@@ -54,8 +54,6 @@
 
 #define MIN_KF_BOOST        300
 
-#define DISABLE_RC_LONG_TERM_MEM 0
-
 #if CONFIG_MULTIPLE_ARF
 // Set MIN_GF_INTERVAL to 1 for the full decomposition.
 #define MIN_GF_INTERVAL             2
@@ -63,6 +61,8 @@
 #define MIN_GF_INTERVAL             4
 #endif
 
+#define DISABLE_RC_LONG_TERM_MEM
+
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
   *a = *b;
@@ -257,12 +257,22 @@ static void avg_stats(FIRSTPASS_STATS *section) {
 // harder frames.
 static double calculate_modified_err(const VP9_COMP *cpi,
                                      const FIRSTPASS_STATS *this_frame) {
-  const struct twopass_rc *const twopass = &cpi->twopass;
-  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
-  const double av_err = stats->ssim_weighted_pred_err / stats->count;
-  double modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
-                                           DOUBLE_DIVIDE_CHECK(av_err),
-                                       cpi->oxcf.two_pass_vbrbias / 100.0);
+  const struct twopass_rc *twopass = &cpi->twopass;
+  const SVC *const svc = &cpi->svc;
+  const FIRSTPASS_STATS *stats;
+  double av_err;
+  double modified_error;
+
+  if (svc->number_spatial_layers > 1 &&
+      svc->number_temporal_layers == 1) {
+    twopass = &svc->layer_context[svc->spatial_layer_id].twopass;
+  }
+
+  stats = &twopass->total_stats;
+  av_err = stats->ssim_weighted_pred_err / stats->count;
+  modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
+                   DOUBLE_DIVIDE_CHECK(av_err),
+                   cpi->oxcf.two_pass_vbrbias / 100.0);
 
   return fclamp(modified_error,
                 twopass->modified_error_min, twopass->modified_error_max);
@@ -326,15 +336,13 @@ static double simple_weight(const YV12_BUFFER_CONFIG *buf) {
 }
 
 // This function returns the maximum target rate per frame.
-static int frame_max_bits(const VP9_COMP *cpi) {
-  int64_t max_bits =
-    ((int64_t)cpi->rc.av_per_frame_bandwidth *
-     (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
-
+static int frame_max_bits(const RATE_CONTROL *rc, const VP9_CONFIG *oxcf) {
+  int64_t max_bits = ((int64_t)rc->av_per_frame_bandwidth *
+                          (int64_t)oxcf->two_pass_vbrmax_section) / 100;
   if (max_bits < 0)
     max_bits = 0;
-  else if (max_bits > cpi->rc.max_frame_bandwidth)
-    max_bits = cpi->rc.max_frame_bandwidth;
+  else if (max_bits > rc->max_frame_bandwidth)
+    max_bits = rc->max_frame_bandwidth;
 
   return (int)max_bits;
 }
@@ -375,7 +383,7 @@ static unsigned int zz_motion_search(const MACROBLOCK *x) {
   const uint8_t *const ref = xd->plane[0].pre[0].buf;
   const int ref_stride = xd->plane[0].pre[0].stride;
   unsigned int sse;
-  vp9_variance_fn_t fn = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type);
+  vp9_variance_fn_t fn = get_block_variance_fn(xd->mi[0]->mbmi.sb_type);
   fn(src, src_stride, ref, ref_stride, &sse);
   return sse;
 }
@@ -389,7 +397,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int num00, tmp_err, n, sr = 0;
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-  const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   int new_mv_mode_penalty = 256;
   const int quart_frm = MIN(cpi->common.width, cpi->common.height);
@@ -533,8 +541,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
   vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
   vp9_setup_dst_planes(xd, new_yv12, 0, 0);
 
-  xd->mi_8x8 = cm->mi_grid_visible;
-  xd->mi_8x8[0] = cm->mi;
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -582,8 +590,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
-      xd->mi_8x8[0]->mbmi.sb_type = bsize;
-      xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+      xd->mi[0]->mbmi.sb_type = bsize;
+      xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
       set_mi_row_col(xd, &tile,
                      mb_row << 1, num_8x8_blocks_high_lookup[bsize],
                      mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
@@ -702,11 +710,11 @@ void vp9_first_pass(VP9_COMP *cpi) {
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
           this_error = motion_error;
-          xd->mi_8x8[0]->mbmi.mode = NEWMV;
-          xd->mi_8x8[0]->mbmi.mv[0] = mv;
-          xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
-          xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
-          xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
+          xd->mi[0]->mbmi.mode = NEWMV;
+          xd->mi[0]->mbmi.mv[0] = mv;
+          xd->mi[0]->mbmi.tx_size = TX_4X4;
+          xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
           vp9_encode_sby_pass1(x, bsize);
           sum_mvr += mv.as_mv.row;
@@ -902,21 +910,21 @@ int vp9_twopass_worst_quality(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
 
   const double section_err = fpstats->coded_error / fpstats->count;
   const double err_per_mb = section_err / num_mbs;
+  const double speed_term = 1.0 + ((double)cpi->speed * 0.04);
 
   if (section_target_bandwitdh <= 0)
     return rc->worst_quality;          // Highest value allowed
 
-  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
-                              ? (512 * section_target_bandwitdh) / num_mbs
-                              : 512 * (section_target_bandwitdh / num_mbs);
+  target_norm_bits_per_mb =
+      ((uint64_t)section_target_bandwitdh << BPER_MB_NORMBITS) / num_mbs;
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
   for (q = rc->best_quality; q < rc->worst_quality; ++q) {
     const double err_correction_factor = calc_correction_factor(err_per_mb,
                                              ERR_DIVISOR, 0.5, 0.90, q);
-    const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
-                                                         err_correction_factor);
+    const int bits_per_mb_at_this_q =
+      vp9_rc_bits_per_mb(INTER_FRAME, q, (err_correction_factor * speed_term));
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
@@ -931,10 +939,18 @@ int vp9_twopass_worst_quality(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
 extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
   FIRSTPASS_STATS this_frame;
   const FIRSTPASS_STATS *start_pos;
-  struct twopass_rc *const twopass = &cpi->twopass;
+  struct twopass_rc *twopass = &cpi->twopass;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
+                             (svc->number_temporal_layers == 1);
+  double frame_rate;
+
+  if (is_spatial_svc) {
+    twopass = &svc->layer_context[svc->spatial_layer_id].twopass;
+  }
 
   zero_stats(&twopass->total_stats);
   zero_stats(&twopass->total_left_stats);
@@ -945,30 +961,44 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   twopass->total_stats = *twopass->stats_in_end;
   twopass->total_left_stats = twopass->total_stats;
 
+  frame_rate = 10000000.0 * twopass->total_stats.count /
+               twopass->total_stats.duration;
   // Each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant. The frame rate prior to the first frame
   // encoded in the second pass is a guess. However, the sum duration is not.
   // It is calculated based on the actual durations of all frames from the
   // first pass.
-  vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count /
-                        twopass->total_stats.duration);
+
+  if (is_spatial_svc) {
+    vp9_update_spatial_layer_framerate(cpi, frame_rate);
+    twopass->bits_left =
+        (int64_t)(twopass->total_stats.duration *
+        svc->layer_context[svc->spatial_layer_id].target_bandwidth /
+        10000000.0);
+  } else {
+    vp9_new_framerate(cpi, frame_rate);
+    twopass->bits_left = (int64_t)(twopass->total_stats.duration *
+                                   oxcf->target_bandwidth / 10000000.0);
+  }
 
   cpi->output_framerate = oxcf->framerate;
-  twopass->bits_left = (int64_t)(twopass->total_stats.duration *
-                                 oxcf->target_bandwidth / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
   // scores used in the second pass. We have this minimum to make sure
   // that clips that are static but "low complexity" in the intra domain
   // are still boosted appropriately for KF/GF/ARF.
-  twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-  twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  if (!is_spatial_svc) {
+    // We don't know the number of MBs for each layer at this point.
+    // So we will do it later.
+    twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+    twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  }
 
   // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
 
-  // Scan the first pass file and calculate an average Intra / Inter error score
-  // ratio for the sequence.
+  // Scan the first pass file and calculate an average Intra / Inter error
+  // score ratio for the sequence.
   {
     double sum_iiratio = 0.0;
     start_pos = twopass->stats_in;
@@ -1027,8 +1057,8 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm,
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval,
-                                      int still_interval,
+static int detect_transition_to_still(struct twopass_rc *twopass,
+                                      int frame_interval, int still_interval,
                                       double loop_decay_rate,
                                       double last_decay_rate) {
   int trans_to_still = 0;
@@ -1040,19 +1070,19 @@ static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval,
       loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-    const FIRSTPASS_STATS *position = cpi->twopass.stats_in;
+    const FIRSTPASS_STATS *position = twopass->stats_in;
     FIRSTPASS_STATS tmp_next_frame;
 
     // Look ahead a few frames to see if static condition persists...
     for (j = 0; j < still_interval; ++j) {
-      if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
+      if (EOF == input_stats(twopass, &tmp_next_frame))
         break;
 
       if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
         break;
     }
 
-    reset_fpf_position(&cpi->twopass, position);
+    reset_fpf_position(twopass, position);
 
     // Only if it does do we signal a transition to still.
     if (j == still_interval)
@@ -1374,9 +1404,11 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
 
 // Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  VP9_CONFIG *const oxcf = &cpi->oxcf;
+  struct twopass_rc *const twopass = &cpi->twopass;
   FIRSTPASS_STATS next_frame = { 0 };
   const FIRSTPASS_STATS *start_pos;
-  struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
   double old_boost_score = 0.0;
@@ -1395,16 +1427,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   double mv_ratio_accumulator_thresh;
-  const int max_bits = frame_max_bits(cpi);  // Max bits for a single frame.
-
-  unsigned int allow_alt_ref = cpi->oxcf.play_alternate &&
-                               cpi->oxcf.lag_in_frames;
+  // Max bits for a single frame.
+  const int max_bits = frame_max_bits(rc, oxcf);
+  unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames;
 
   int f_boost = 0;
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
-  RATE_CONTROL *const rc = &cpi->rc;
 
   twopass->gf_group_bits = 0;
 
@@ -1476,7 +1506,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+      if (detect_transition_to_still(twopass, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
         allow_alt_ref = 0;
         break;
@@ -1615,8 +1645,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   // Calculate the bits to be allocated to the group as a whole.
   if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) {
-    twopass->gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits *
-                (gf_group_err / cpi->twopass.kf_group_error_left));
+    twopass->gf_group_bits = (int64_t)(twopass->kf_group_bits *
+                (gf_group_err / twopass->kf_group_error_left));
   } else {
     twopass->gf_group_bits = 0;
   }
@@ -1705,10 +1735,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   {
     // Adjust KF group bits and error remaining.
     twopass->kf_group_error_left -= (int64_t)gf_group_err;
-    twopass->kf_group_bits -= twopass->gf_group_bits;
-
-    if (twopass->kf_group_bits < 0)
-      twopass->kf_group_bits = 0;
 
     // If this is an arf update we want to remove the score for the overlay
     // frame at the end which will usually be very cheap to code.
@@ -1725,11 +1751,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       twopass->gf_group_error_left = (int64_t)gf_group_err;
     }
 
-    twopass->gf_group_bits -= twopass->gf_bits;
-
-    if (twopass->gf_group_bits < 0)
-      twopass->gf_group_bits = 0;
-
     // This condition could fail if there are two kfs very close together
     // despite MIN_GF_INTERVAL and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
@@ -1738,8 +1759,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
       if (boost >= 150) {
         const int pct_extra = MIN(20, (boost - 100) / 50);
-        const int alt_extra_bits = (int)((twopass->gf_group_bits * pct_extra) /
-                                       100);
+        const int alt_extra_bits = (int)((
+            MAX(twopass->gf_group_bits - twopass->gf_bits, 0) *
+            pct_extra) / 100);
         twopass->gf_group_bits -= alt_extra_bits;
       }
     }
@@ -1768,40 +1790,36 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
 static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  struct twopass_rc *twopass = &cpi->twopass;
+  // Max bits for a single frame.
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  // Calculate modified prediction error used in bit allocation.
+  const double modified_err = calculate_modified_err(cpi, this_frame);
   int target_frame_size;
-  double modified_err;
   double err_fraction;
-  const int max_bits = frame_max_bits(cpi);  // Max for a single frame.
-
-  // Calculate modified prediction error used in bit allocation.
-  modified_err = calculate_modified_err(cpi, this_frame);
 
-  if (cpi->twopass.gf_group_error_left > 0)
+  if (twopass->gf_group_error_left > 0)
     // What portion of the remaining GF group error is used by this frame.
-    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
+    err_fraction = modified_err / twopass->gf_group_error_left;
   else
     err_fraction = 0.0;
 
   // How many of those bits available for allocation should we give it?
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
+  target_frame_size = (int)((double)twopass->gf_group_bits * err_fraction);
 
   // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
   // the top end.
   target_frame_size = clamp(target_frame_size, 0,
-                            MIN(max_bits, (int)cpi->twopass.gf_group_bits));
+                            MIN(max_bits, (int)twopass->gf_group_bits));
 
   // Adjust error and bits remaining.
-  cpi->twopass.gf_group_error_left -= (int64_t)modified_err;
-  cpi->twopass.gf_group_bits -= target_frame_size;
-
-  if (cpi->twopass.gf_group_bits < 0)
-    cpi->twopass.gf_group_bits = 0;
+  twopass->gf_group_error_left -= (int64_t)modified_err;
 
   // Per frame bit target for this frame.
   vp9_rc_set_frame_target(cpi, target_frame_size);
 }
 
-static int test_candidate_kf(VP9_COMP *cpi,
+static int test_candidate_kf(struct twopass_rc *twopass,
                              const FIRSTPASS_STATS *last_frame,
                              const FIRSTPASS_STATS *this_frame,
                              const FIRSTPASS_STATS *next_frame) {
@@ -1822,7 +1840,7 @@ static int test_candidate_kf(VP9_COMP *cpi,
          ((next_frame->intra_error /
            DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
     int i;
-    const FIRSTPASS_STATS *start_pos = cpi->twopass.stats_in;
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
     FIRSTPASS_STATS local_next_frame = *next_frame;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
@@ -1859,7 +1877,7 @@ static int test_candidate_kf(VP9_COMP *cpi,
       old_boost_score = boost_score;
 
       // Get the next frame details
-      if (EOF == input_stats(&cpi->twopass, &local_next_frame))
+      if (EOF == input_stats(twopass, &local_next_frame))
         break;
     }
 
@@ -1869,7 +1887,7 @@ static int test_candidate_kf(VP9_COMP *cpi,
       is_viable_kf = 1;
     } else {
       // Reset the file position
-      reset_fpf_position(&cpi->twopass, start_pos);
+      reset_fpf_position(twopass, start_pos);
 
       is_viable_kf = 0;
     }
@@ -1882,16 +1900,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int i, j;
   RATE_CONTROL *const rc = &cpi->rc;
   struct twopass_rc *const twopass = &cpi->twopass;
-  FIRSTPASS_STATS last_frame;
   const FIRSTPASS_STATS first_frame = *this_frame;
-  FIRSTPASS_STATS next_frame;
   const FIRSTPASS_STATS *start_position = twopass->stats_in;
-
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS last_frame;
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
-  double boost_score = 0;
-  double loop_decay_rate;
-
+  double boost_score = 0.0;
   double kf_mod_err = 0.0;
   double kf_group_err = 0.0;
   double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
@@ -1929,8 +1944,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Provided that we are not at the end of the file...
     if (cpi->oxcf.auto_key &&
         lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+      double loop_decay_rate;
+
       // Check for a scene cut.
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
+      if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
         break;
 
       // How fast is the prediction quality decaying?
@@ -1946,7 +1963,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
       // Special check for transition or high motion followed by a
       // static scene.
-      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+      if (detect_transition_to_still(twopass, i, cpi->key_frame_frequency - i,
                                      loop_decay_rate, decay_accumulator))
         break;
 
@@ -1999,7 +2016,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Calculate the number of bits that should be assigned to the kf group.
   if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
     // Maximum number of bits for a single normal frame (not key frame).
-    const int max_bits = frame_max_bits(cpi);
+    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
 
     // Maximum number of bits allocated to the key frame group.
     int64_t max_grp_bits;
@@ -2051,10 +2068,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
-        loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+        const double loop_decay_rate = get_prediction_decay_rate(&cpi->common,
+                                                                 &next_frame);
         decay_accumulator *= loop_decay_rate;
-        decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                              ? MIN_DECAY_FACTOR : decay_accumulator;
+        decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
       }
 
       boost_score += (decay_accumulator * r);
@@ -2085,7 +2102,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if (1) {
     int kf_boost = (int)boost_score;
     int allocation_chunks;
-    int alt_kf_bits;
 
     if (kf_boost < (rc->frames_to_key * 3))
       kf_boost = (rc->frames_to_key * 3);
@@ -2119,14 +2135,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // Prevent overflow.
     if (kf_boost > 1028) {
-      int divisor = kf_boost >> 10;
+      const int divisor = kf_boost >> 10;
       kf_boost /= divisor;
       allocation_chunks /= divisor;
     }
 
-    twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0
-           : twopass->kf_group_bits;
-
+    twopass->kf_group_bits = MAX(0, twopass->kf_group_bits);
     // Calculate the number of bits to be spent on the key frame.
     twopass->kf_bits = (int)((double)kf_boost *
         ((double)twopass->kf_group_bits / allocation_chunks));
@@ -2136,11 +2150,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // then use an alternate calculation based on the kf error score
     // which should give a smaller key frame.
     if (kf_mod_err < kf_group_err / rc->frames_to_key) {
-      double  alt_kf_grp_bits = ((double)twopass->bits_left *
+      double alt_kf_grp_bits = ((double)twopass->bits_left *
          (kf_mod_err * (double)rc->frames_to_key) /
          DOUBLE_DIVIDE_CHECK(twopass->modified_error_left));
 
-      alt_kf_bits = (int)((double)kf_boost *
+      const int alt_kf_bits = (int)((double)kf_boost *
                           (alt_kf_grp_bits / (double)allocation_chunks));
 
       if (twopass->kf_bits > alt_kf_bits)
@@ -2149,12 +2163,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // Else if it is much harder than other frames in the group make sure
       // it at least receives an allocation in keeping with its relative
       // error score.
-      alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err /
+      const int alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err /
                DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)));
 
-      if (alt_kf_bits > twopass->kf_bits) {
+      if (alt_kf_bits > twopass->kf_bits)
         twopass->kf_bits = alt_kf_bits;
-      }
     }
     twopass->kf_group_bits -= twopass->kf_bits;
     // Per frame bit target for this frame.
@@ -2187,14 +2200,24 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   struct twopass_rc *const twopass = &cpi->twopass;
-  const int frames_left = (int)(twopass->total_stats.count -
-                              cm->current_video_frame);
+  int frames_left;
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
 
   double this_frame_intra_error;
   double this_frame_coded_error;
   int target;
+  LAYER_CONTEXT *lc = NULL;
+  int is_spatial_svc = (cpi->use_svc && cpi->svc.number_temporal_layers == 1);
+
+  if (is_spatial_svc) {
+    lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+    frames_left = (int)(twopass->total_stats.count -
+                  lc->current_video_frame_in_layer);
+  } else {
+    frames_left = (int)(twopass->total_stats.count -
+                  cm->current_video_frame);
+  }
 
   if (!twopass->stats_in)
     return;
@@ -2207,9 +2230,15 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
   vp9_clear_system_state();
 
+  if (is_spatial_svc && twopass->kf_intra_err_min == 0) {
+    twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+    twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  }
+
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
-  } else if (cm->current_video_frame == 0) {
+  } else if (cm->current_video_frame == 0 ||
+             (is_spatial_svc && lc->current_video_frame_in_layer == 0)) {
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2232,6 +2261,11 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     // Define next KF group and assign bits to it.
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
+    // Don't place key frame in any enhancement layers in spatial svc
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+        cpi->svc.spatial_layer_id > 0) {
+      cm->frame_type = INTER_FRAME;
+    }
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2291,23 +2325,29 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
-void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+// Post-encode update of the two-pass bit budgets: charges the bits spent on
+// the frame just coded against bits_left and the kf/gf group budgets, and
+// clamps the budgets it touches at zero.
+void vp9_twopass_postencode_update(VP9_COMP *cpi) {
+  // bits_used is signed so that (target - bits_used) below goes negative on
+  // overshoot instead of relying on unsigned wraparound.
 #ifdef DISABLE_RC_LONG_TERM_MEM
-  cpi->twopass.bits_left -=  cpi->rc.this_frame_target;
+  const int64_t bits_used = cpi->rc.this_frame_target;
 #else
-  cpi->twopass.bits_left -= 8 * bytes_used;
+  const int64_t bits_used = cpi->rc.projected_frame_size;
+#endif
+  cpi->twopass.bits_left -= bits_used;
+  cpi->twopass.bits_left = MAX(cpi->twopass.bits_left, 0);
   // Update bits left to the kf and gf groups to account for overshoot or
   // undershoot on these frames.
-  if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
-        cpi->rc.projected_frame_size;
-
-    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
-  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
-        cpi->rc.projected_frame_size;
-
+  if (cpi->common.frame_type == KEY_FRAME) {
+    // For key frames kf_group_bits already had the target bits subtracted out.
+    // So now update to the correct value based on the actual bits used.
+    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - bits_used;
+  } else {
+    cpi->twopass.kf_group_bits -= bits_used;
+    cpi->twopass.gf_group_bits -= bits_used;
     cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
   }
-#endif
+  cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
 }
diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h
index bf7b5a1..7a16c8f 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/libvpx/vp9/encoder/vp9_firstpass.h
@@ -35,7 +35,7 @@ typedef struct {
   double new_mv_count;
   double duration;
   double count;
-  int spatial_layer_id;
+  int64_t spatial_layer_id;
 } FIRSTPASS_STATS;
 
 struct twopass_rc {
@@ -95,8 +95,7 @@ int vp9_twopass_worst_quality(struct VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
                               int section_target_bandwitdh);
 
 // Post encode update of the rate control parameters for 2-pass
-void vp9_twopass_postencode_update(struct VP9_COMP *cpi,
-                                   uint64_t bytes_used);
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c
index a88d5ec..cf03e01 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/libvpx/vp9/encoder/vp9_lookahead.c
@@ -28,8 +28,8 @@ struct lookahead_ctx {
 
 
 /* Return the buffer at the given absolute index and increment the index */
-static struct lookahead_entry * pop(struct lookahead_ctx *ctx,
-                                    unsigned int *idx) {
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
+                                   unsigned int *idx) {
   unsigned int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
@@ -55,16 +55,19 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
 }
 
 
-struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
-                                          unsigned int height,
-                                          unsigned int subsampling_x,
-                                          unsigned int subsampling_y,
-                                          unsigned int depth) {
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+                                         unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
   // Clamp the lookahead queue depth
   depth = clamp(depth, 1, MAX_LAG_BUFFERS);
 
+  // Allocate memory to keep previous source frames available.
+  depth += MAX_PRE_FRAMES;
+
   // Allocate the lookahead structures
   ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
@@ -96,7 +99,7 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
   int mb_cols = (src->y_width + 15) >> 4;
 #endif
 
-  if (ctx->sz + 1 > ctx->max_sz)
+  if (ctx->sz + 1  + MAX_PRE_FRAMES > ctx->max_sz)
     return 1;
   ctx->sz++;
   buf = pop(ctx, &ctx->write_idx);
@@ -159,11 +162,11 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
 }
 
 
-struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx,
-                                           int drain) {
+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain) {
   struct lookahead_entry *buf = NULL;
 
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
+  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
     buf = pop(ctx, &ctx->read_idx);
     ctx->sz--;
   }
@@ -171,16 +174,33 @@ struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx,
 }
 
 
-struct lookahead_entry * vp9_lookahead_peek(struct lookahead_ctx *ctx,
-                                            int index) {
+// Returns the queue entry at `index` relative to the read position without
+// removing it, or NULL when the index is out of range. Non-negative indices
+// look ahead into the queued frames; negative indices look back at up to
+// MAX_PRE_FRAMES slots before the read position (previous source frames).
+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index) {
   struct lookahead_entry *buf = NULL;
 
-  if (index < (int)ctx->sz) {
-    index += ctx->read_idx;
-    if (index >= (int)ctx->max_sz)
-      index -= ctx->max_sz;
-    buf = ctx->buf + index;
+  if (index >= 0) {
+    // Forward peek
+    if (index < (int)ctx->sz) {
+      index += ctx->read_idx;
+      if (index >= (int)ctx->max_sz)
+        index -= ctx->max_sz;
+      buf = ctx->buf + index;
+    }
+  } else {
+    // Backward peek. Compare against -MAX_PRE_FRAMES rather than negating
+    // index, so that INT_MIN cannot overflow.
+    if (index >= -MAX_PRE_FRAMES) {
+      index += ctx->read_idx;
+      if (index < 0)
+        index += ctx->max_sz;
+      buf = ctx->buf + index;
+    }
   }
+
   return buf;
 }
 
diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h
index ff63c0d..046c533 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/libvpx/vp9/encoder/vp9_lookahead.h
@@ -20,6 +20,9 @@ extern "C" {
 
 #define MAX_LAG_BUFFERS 25
 
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
 struct lookahead_entry {
   YV12_BUFFER_CONFIG  img;
   int64_t             ts_start;
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 6520389..44b171f 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -61,8 +61,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
         &sse);
   }
 
-  xd->mi_8x8[0]->mbmi.mode = NEWMV;
-  xd->mi_8x8[0]->mbmi.mv[0].as_mv = *dst_mv;
+  xd->mi[0]->mbmi.mode = NEWMV;
+  xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
 
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
 
@@ -145,7 +145,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi,
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     unsigned int err;
 
-    xd->mi_8x8[0]->mbmi.mode = mode;
+    xd->mi[0]->mbmi.mode = mode;
     vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
                             x->plane[0].src.buf, x->plane[0].src.stride,
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
@@ -252,7 +252,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   xd->plane[0].dst.stride  = buf->y_stride;
   xd->plane[0].pre[0].stride  = buf->y_stride;
   xd->plane[1].dst.stride = buf->uv_stride;
-  xd->mi_8x8[0] = &mi_local;
+  xd->mi[0] = &mi_local;
   mi_local.mbmi.sb_type = BLOCK_16X16;
   mi_local.mbmi.ref_frame[0] = LAST_FRAME;
   mi_local.mbmi.ref_frame[1] = NONE;
@@ -370,7 +370,6 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
     else
       cpi->static_mb_pct = 0;
 
-    cpi->seg0_cnt = ncnt[0];
     vp9_enable_segmentation(&cm->seg);
   } else {
     cpi->static_mb_pct = 0;
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index 2ae8a2a..f7a02a4 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -23,6 +23,11 @@
 
 // #define NEW_DIAMOND_SEARCH
 
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
 void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
   int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
   int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
@@ -370,9 +375,9 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x,
   unsigned int sse;
   unsigned int whichdir;
   int thismse;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
+  const unsigned int halfiters = iters_per_step;
+  const unsigned int quarteriters = iters_per_step;
+  const unsigned int eighthiters = iters_per_step;
 
   DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
   const int y_stride = xd->plane[0].pre[0].stride;
@@ -399,7 +404,7 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x,
   // calculate central point error
   // TODO(yunqingwang): central pointer error was already calculated in full-
   // pixel search, and can be passed in this function.
-  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  vp9_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
   besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
@@ -495,8 +500,7 @@ static int vp9_pattern_search(const MACROBLOCK *x,
                               MV *ref_mv,
                               int search_param,
                               int sad_per_bit,
-                              int do_init_search,
-                              int do_refine,
+                              int do_init_search, int do_refine,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
                               const MV *center_mv, MV *best_mv,
@@ -508,20 +512,15 @@ static int vp9_pattern_search(const MACROBLOCK *x,
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
   int i, j, s, t;
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   int br, bc;
-  MV this_mv;
   int bestsad = INT_MAX;
   int thissad;
-  const uint8_t *base_offset;
-  const uint8_t *this_offset;
   int k = -1;
-  int best_site = -1;
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_init_s = search_param_to_steps[search_param];
-  const int *mvjsadcost = x->nmvjointsadcost;
+  const int *const mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
   // adjust ref_mv to make sure it is within MV range
@@ -530,13 +529,10 @@ static int vp9_pattern_search(const MACROBLOCK *x,
   bc = ref_mv->col;
 
   // Work out the start point for the search
-  base_offset = xd->plane[0].pre[0].buf;
-  this_offset = base_offset + (br * in_what_stride) + bc;
-  this_mv.row = br;
-  this_mv.col = bc;
-  bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(&this_mv, &fcenter_mv,
-                                 mvjsadcost, mvsadcost, sad_per_bit);
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                     0x7fffffff) + mvsad_err_cost(ref_mv, &fcenter_mv,
+                         mvjsadcost, mvsadcost, sad_per_bit);
 
   // Search all possible scales upto the search param around the center point
   // pick the scale of the point that is best as the starting scale of
@@ -545,27 +541,25 @@ static int vp9_pattern_search(const MACROBLOCK *x,
     s = best_init_s;
     best_init_s = -1;
     for (t = 0; t <= s; ++t) {
-      best_site = -1;
+      int best_site = -1;
       if (check_bounds(x, br, bc, 1 << t)) {
         for (i = 0; i < num_candidates[t]; i++) {
-          this_mv.row = br + candidates[t][i].row;
-          this_mv.col = bc + candidates[t][i].col;
-          this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                       this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < num_candidates[t]; i++) {
-          this_mv.row = br + candidates[t][i].row;
-          this_mv.col = bc + candidates[t][i].col;
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
           if (!is_mv_in(x, &this_mv))
             continue;
-          this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                       this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       }
@@ -585,31 +579,30 @@ static int vp9_pattern_search(const MACROBLOCK *x,
   // If the center point is still the best, just skip this and move to
   // the refinement step.
   if (best_init_s != -1) {
+    int best_site = -1;
     s = best_init_s;
-    best_site = -1;
+
     do {
       // No need to search all 6 points the 1st time if initial search was used
       if (!do_init_search || s != best_init_s) {
         if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
-            this_mv.row = br + candidates[s][i].row;
-            this_mv.col = bc + candidates[s][i].col;
-            this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            this_mv.row = br + candidates[s][i].row;
-            this_mv.col = bc + candidates[s][i].col;
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
             if (!is_mv_in(x, &this_mv))
               continue;
-            this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         }
@@ -632,24 +625,22 @@ static int vp9_pattern_search(const MACROBLOCK *x,
 
         if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
-            this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
-            this_offset = base_offset + (this_mv.row * (in_what_stride)) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
-            this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
             if (!is_mv_in(x, &this_mv))
               continue;
-            this_offset = base_offset + (this_mv.row * (in_what_stride)) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         }
@@ -666,29 +657,28 @@ static int vp9_pattern_search(const MACROBLOCK *x,
   // Check 4 1-away neighbors if do_refine is true.
   // For most well-designed schemes do_refine will not be necessary.
   if (do_refine) {
-    static const MV neighbors[4] = { {0, -1}, { -1, 0}, {1, 0}, {0, 1} };
+    static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+
     for (j = 0; j < 16; j++) {
-      best_site = -1;
+      int best_site = -1;
       if (check_bounds(x, br, bc, 1)) {
         for (i = 0; i < 4; i++) {
-          this_mv.row = br + neighbors[i].row;
-          this_mv.col = bc + neighbors[i].col;
-          this_offset = base_offset + this_mv.row * in_what_stride +
-                            this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          const MV this_mv = {br + neighbors[i].row,
+                              bc + neighbors[i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < 4; i++) {
-          this_mv.row = br + neighbors[i].row;
-          this_mv.col = bc + neighbors[i].col;
+          const MV this_mv = {br + neighbors[i].row,
+                              bc + neighbors[i].col};
           if (!is_mv_in(x, &this_mv))
             continue;
-          this_offset = base_offset + this_mv.row * in_what_stride +
-                            this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       }
@@ -705,8 +695,6 @@ static int vp9_pattern_search(const MACROBLOCK *x,
   best_mv->row = br;
   best_mv->col = bc;
 
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
   return bestsad;
 }
 
@@ -714,41 +702,32 @@ int vp9_get_mvpred_var(const MACROBLOCK *x,
                        const MV *best_mv, const MV *center_mv,
                        const vp9_variance_fn_ptr_t *vfp,
                        int use_mvcost) {
-  unsigned int unused;
-
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *base_offset = xd->plane[0].pre[0].buf;
-  const uint8_t *this_offset = &base_offset[best_mv->row * in_what_stride +
-                                            best_mv->col];
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV mv = {best_mv->row * 8, best_mv->col * 8};
-  return vfp->vf(what, what_stride, this_offset, in_what_stride, &unused) +
+  unsigned int unused;
+
+  return vfp->vf(what->buf, what->stride,
+                 get_buf_from_mv(in_what, best_mv), in_what->stride, &unused) +
       (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
                                  x->mvcost, x->errorperbit) : 0);
 }
 
 int vp9_get_mvpred_av_var(const MACROBLOCK *x,
-                          MV *best_mv,
-                          const MV *center_mv,
+                          const MV *best_mv, const MV *center_mv,
                           const uint8_t *second_pred,
                           const vp9_variance_fn_ptr_t *vfp,
                           int use_mvcost) {
-  unsigned int bestsad;
-  MV this_mv;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *base_offset = xd->plane[0].pre[0].buf;
-  const uint8_t *this_offset = base_offset + (best_mv->row * in_what_stride) +
-      best_mv->col;
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-  return vfp->svaf(this_offset, in_what_stride, 0, 0, what, what_stride,
-                   &bestsad, second_pred) +
-      (use_mvcost ?  mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int unused;
+
+  return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+                   what->buf, what->stride, &unused, second_pred) +
+      (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
                                  x->mvcost, x->errorperbit) : 0);
 }
 
@@ -908,7 +887,6 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *in_what;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV this_mv;
 
   unsigned int bestsad = INT_MAX;
   int ref_row, ref_col;
@@ -960,8 +938,7 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
 
         for (i = 0; i < 4; ++i) {
           if (sad_array[i] < bestsad) {
-            this_mv.row = ref_row + tr;
-            this_mv.col = ref_col + tc + i;
+            const MV this_mv = {ref_row + tr, ref_col + tc + i};
             thissad = sad_array[i] +
                       mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
@@ -979,8 +956,7 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
                                 bestsad);
 
           if (thissad < bestsad) {
-            this_mv.row = ref_row + tr;
-            this_mv.col = ref_col + tc + i;
+            const MV this_mv = {ref_row + tr, ref_col + tc + i};
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
 
@@ -1005,66 +981,49 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
                              const vp9_variance_fn_ptr_t *fn_ptr,
                              int *mvjcost, int *mvcost[2],
                              const MV *center_mv) {
-  int i, j, step;
-
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *in_what;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *best_address;
-
-  int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-
-  int ref_row, ref_col;
-
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   // search_param determines the length of the initial step and hence the number
   // of iterations
   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
   // (MAX_FIRST_STEP/4) pel... etc.
   const search_site *const ss = &x->ss[search_param * x->searches_per_step];
   const int tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+  const uint8_t *best_address;
+  int best_sad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+  int i, j, step;
 
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->row;
-  ref_col = ref_mv->col;
+  best_address = get_buf_from_mv(in_what, ref_mv);
   *num00 = 0;
-  best_mv->row = ref_row;
-  best_mv->col = ref_col;
-
-  // Work out the start point for the search
-  in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
-  best_address = in_what;
+  *best_mv = *ref_mv;
 
   // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(best_mv, &fcenter_mv,
-                                 mvjsadcost, mvsadcost, sad_per_bit);
+  best_sad = fn_ptr->sdf(what->buf, what->stride,
+                         best_address, in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit);
 
   i = 1;
 
   for (step = 0; step < tot_steps; step++) {
     for (j = 0; j < x->searches_per_step; j++) {
-      const MV this_mv = {best_mv->row + ss[i].mv.row,
-                          best_mv->col + ss[i].mv.col};
-      if (is_mv_in(x, &this_mv)) {
-        const uint8_t *const check_here = ss[i].offset + best_address;
-        int thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                              bestsad);
-
-        if (thissad < bestsad) {
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                    mvjsadcost, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
+      const MV mv = {best_mv->row + ss[i].mv.row,
+                     best_mv->col + ss[i].mv.col};
+      if (is_mv_in(x, &mv)) {
+        int sad = fn_ptr->sdf(what->buf, what->stride,
+                              best_address + ss[i].offset, in_what->stride,
+                              best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                                sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = i;
           }
         }
@@ -1083,14 +1042,14 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
         const MV this_mv = {best_mv->row + ss[best_site].mv.row,
                             best_mv->col + ss[best_site].mv.col};
         if (is_mv_in(x, &this_mv)) {
-          const uint8_t *const check_here = ss[best_site].offset + best_address;
-          int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                    in_what_stride, bestsad);
-          if (thissad < bestsad) {
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                      mvjsadcost, mvsadcost, sad_per_bit);
-            if (thissad < bestsad) {
-              bestsad = thissad;
+          int sad = fn_ptr->sdf(what->buf, what->stride,
+                                best_address + ss[best_site].offset,
+                                in_what->stride, best_sad);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                  mvjsadcost, mvsadcost, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
               best_mv->row += ss[best_site].mv.row;
               best_mv->col += ss[best_site].mv.col;
               best_address += ss[best_site].offset;
@@ -1101,11 +1060,11 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
         break;
       };
 #endif
-    } else if (best_address == in_what) {
+    } else if (best_address == in_what->buf) {
       (*num00)++;
     }
   }
-  return bestsad;
+  return best_sad;
 }
 
 int vp9_diamond_search_sadx4(const MACROBLOCK *x,
@@ -1331,10 +1290,8 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           const MV *center_mv, MV *best_mv) {
   int r, c;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
   const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
   const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
@@ -1342,25 +1299,22 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
-                                         ref_mv->col];
-  int best_sad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride,
-                             0x7fffffff) +
+  int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit);
   *best_mv = *ref_mv;
 
   for (r = row_min; r < row_max; ++r) {
     for (c = col_min; c < col_max; ++c) {
-      const MV this_mv = {r, c};
-      const uint8_t *check_here = &in_what[r * in_what_stride + c];
-      const int sad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                  best_sad) +
-          mvsad_err_cost(&this_mv, &fcenter_mv,
-                         mvjsadcost, mvsadcost, sad_per_bit);
+      const MV mv = {r, c};
+      const int sad = fn_ptr->sdf(what->buf, what->stride,
+          get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) +
+          mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                         sad_per_bit);
 
       if (sad < best_sad) {
         best_sad = sad;
-        *best_mv = this_mv;
+        *best_mv = mv;
       }
     }
   }
@@ -1472,7 +1426,6 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
-  unsigned int thissad;
   int ref_row = ref_mv->row;
   int ref_col = ref_mv->col;
 
@@ -1512,7 +1465,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
       fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
 
       for (i = 0; i < 8; i++) {
-        thissad = (unsigned int)sad_array8[i];
+        unsigned int thissad = (unsigned int)sad_array8[i];
 
         if (thissad < bestsad) {
           this_mv.col = c;
@@ -1537,12 +1490,12 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
       fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
 
       for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
+        unsigned int thissad = sad_array[i];
 
         if (thissad < bestsad) {
           this_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     mvjsadcost, mvsadcost, sad_per_bit);
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                    mvjsadcost, mvsadcost, sad_per_bit);
 
           if (thissad < bestsad) {
             bestsad = thissad;
@@ -1557,8 +1510,8 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
     }
 
     while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                            bestsad);
+      unsigned int thissad = fn_ptr->sdf(what, what_stride,
+                                         check_here, in_what_stride, bestsad);
 
       if (thissad < bestsad) {
         this_mv.col = c;
@@ -1585,41 +1538,34 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2],
                               const MV *center_mv) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
-  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
-                                             ref_mv->col];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  unsigned int bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                                     in_what_stride, 0x7fffffff) +
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, ref_mv),
+                                     in_what->stride, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+  int i, j;
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
 
     for (j = 0; j < 4; j++) {
-      const MV this_mv = {ref_mv->row + neighbors[j].row,
-                          ref_mv->col + neighbors[j].col};
-      if (is_mv_in(x, &this_mv)) {
-        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
-                                                this_mv.col];
-        unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                           in_what_stride, bestsad);
-        if (thissad < bestsad) {
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                    mvjsadcost, mvsadcost, error_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                                error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = j;
           }
         }
@@ -1633,7 +1579,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
       ref_mv->col += neighbors[best_site].col;
     }
   }
-  return bestsad;
+  return best_sad;
 }
 
 int vp9_refining_search_sadx4(const MACROBLOCK *x,
@@ -1643,74 +1589,64 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x,
                               int *mvjcost, int *mvcost[2],
                               const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *what = x->plane[0].src.buf;
-  const uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->row * xd->plane[0].pre[0].stride) +
-                          ref_mv->col;
-
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  unsigned int bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                                    in_what_stride, 0x7fffffff) +
+  const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address,
+                                    in_what->stride, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+  int i, j;
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
-    int all_in = ((ref_mv->row - 1) > x->mv_row_min) &
-                 ((ref_mv->row + 1) < x->mv_row_max) &
-                 ((ref_mv->col - 1) > x->mv_col_min) &
-                 ((ref_mv->col + 1) < x->mv_col_max);
+    const int all_in = ((ref_mv->row - 1) > x->mv_row_min) &
+                       ((ref_mv->row + 1) < x->mv_row_max) &
+                       ((ref_mv->col - 1) > x->mv_col_min) &
+                       ((ref_mv->col + 1) < x->mv_col_max);
 
     if (all_in) {
-      unsigned int sad_array[4];
-      uint8_t const *block_offset[4] = {
-        best_address - in_what_stride,
+      unsigned int sads[4];
+      const uint8_t *const positions[4] = {
+        best_address - in_what->stride,
         best_address - 1,
         best_address + 1,
-        best_address + in_what_stride
+        best_address + in_what->stride
       };
 
-      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                     sad_array);
+      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
 
-      for (j = 0; j < 4; j++) {
-        if (sad_array[j] < bestsad) {
-          const MV this_mv = {ref_mv->row + neighbors[j].row,
-                              ref_mv->col + neighbors[j].col};
-          sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv,
+      for (j = 0; j < 4; ++j) {
+        if (sads[j] < best_sad) {
+          const MV mv = {ref_mv->row + neighbors[j].row,
+                         ref_mv->col + neighbors[j].col};
+          sads[j] += mvsad_err_cost(&mv, &fcenter_mv,
                                          mvjsadcost, mvsadcost, error_per_bit);
 
-          if (sad_array[j] < bestsad) {
-            bestsad = sad_array[j];
+          if (sads[j] < best_sad) {
+            best_sad = sads[j];
             best_site = j;
           }
         }
       }
     } else {
-      for (j = 0; j < 4; j++) {
-        const MV this_mv = {ref_mv->row + neighbors[j].row,
-                            ref_mv->col + neighbors[j].col};
-
-        if (is_mv_in(x, &this_mv)) {
-          const uint8_t *check_here = neighbors[j].row * in_what_stride +
-                                      neighbors[j].col + best_address;
-          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride, bestsad);
-
-          if (thissad < bestsad) {
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                      mvjsadcost, mvsadcost, error_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
+      for (j = 0; j < 4; ++j) {
+        const MV mv = {ref_mv->row + neighbors[j].row,
+                       ref_mv->col + neighbors[j].col};
+
+        if (is_mv_in(x, &mv)) {
+          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                         get_buf_from_mv(in_what, &mv),
+                                         in_what->stride, best_sad);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(&mv, &fcenter_mv,
+                                  mvjsadcost, mvsadcost, error_per_bit);
+
+            if (sad < best_sad) {
+              best_sad = sad;
               best_site = j;
             }
           }
@@ -1723,12 +1659,11 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x,
     } else {
       ref_mv->row += neighbors[best_site].row;
       ref_mv->col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-                      neighbors[best_site].col;
+      best_address = get_buf_from_mv(in_what, ref_mv);
     }
   }
 
-  return bestsad;
+  return best_sad;
 }
 
 // This function is called when we do joint motion search in comp_inter_inter
@@ -1740,48 +1675,36 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
                              int *mvjcost, int *mvcost[2],
                              const MV *center_mv,
                              const uint8_t *second_pred, int w, int h) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
                            {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
-  int i, j;
-
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *in_what = xd->plane[0].pre[0].buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
-                                             ref_mv->col];
-  unsigned int thissad;
-  MV this_mv;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  /* Get compound pred by averaging two pred blocks. */
-  unsigned int bestsad = fn_ptr->sdaf(what, what_stride,
-                                      best_address, in_what_stride,
-                                      second_pred, 0x7fffffff) +
+  unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride,
+      second_pred, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+  int i, j;
 
   for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
-    for (j = 0; j < 8; j++) {
-      this_mv.row = ref_mv->row + neighbors[j].row;
-      this_mv.col = ref_mv->col + neighbors[j].col;
-
-      if (is_mv_in(x, &this_mv)) {
-        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
-                                                this_mv.col];
+    for (j = 0; j < 8; ++j) {
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
 
-        thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
-                               second_pred, bestsad);
-        if (thissad < bestsad) {
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride,
+            second_pred, best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(&mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = j;
           }
         }
@@ -1795,5 +1718,5 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
       ref_mv->col += neighbors[best_site].col;
     }
   }
-  return bestsad;
+  return best_sad;
 }
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 917de75..f7b7c5e 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -42,8 +42,7 @@ int vp9_get_mvpred_var(const MACROBLOCK *x,
                        const vp9_variance_fn_ptr_t *vfp,
                        int use_mvcost);
 int vp9_get_mvpred_av_var(const MACROBLOCK *x,
-                          MV *best_mv,
-                          const MV *center_mv,
+                          const MV *best_mv, const MV *center_mv,
                           const uint8_t *second_pred,
                           const vp9_variance_fn_ptr_t *vfp,
                           int use_mvcost);
diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c
index cccc1a9..3619ec8 100644
--- a/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -27,8 +27,10 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_craq.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_firstpass.h"
@@ -38,17 +40,14 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_ssim.h"
+#endif
 #include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/vp9_vaq.h"
 #include "vp9/encoder/vp9_resize.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 
-#define ALL_INTRA_MODES 0x3FF
-#define INTRA_DC_ONLY 0x01
-#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
-#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
-#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
-
 void vp9_coef_tree_initialize();
 
 #define DEFAULT_INTERP_FILTER SWITCHABLE
@@ -62,30 +61,11 @@ void vp9_coef_tree_initialize();
                                          // now so that HIGH_PRECISION is always
                                          // chosen.
 
-// Masks for partially or completely disabling split mode
-#define DISABLE_ALL_SPLIT         0x3F
-#define DISABLE_ALL_INTER_SPLIT   0x1F
-#define DISABLE_COMPOUND_SPLIT    0x18
-#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
-
 // Max rate target for 1080P and below encodes under normal circumstances
 // (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
 #define MAX_MB_RATE 250
 #define MAXRATE_1080P 2025000
 
-#if CONFIG_INTERNAL_STATS
-extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
-                            YV12_BUFFER_CONFIG *dest, int lumamask,
-                            double *weight);
-
-
-extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
-                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                             double *ssim_u, double *ssim_v);
-
-
-#endif
-
 // #define OUTPUT_YUV_REC
 
 #ifdef OUTPUT_YUV_SRC
@@ -103,9 +83,6 @@ FILE *keyfile;
 
 void vp9_init_quantizer(VP9_COMP *cpi);
 
-static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
-  {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-
 static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -144,17 +121,33 @@ static void set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
   }
 }
 
+static void setup_key_frame(VP9_COMP *cpi) {
+  vp9_setup_past_independence(&cpi->common);
+
+  // All buffers are implicitly updated on key frames.
+  cpi->refresh_golden_frame = 1;
+  cpi->refresh_alt_ref_frame = 1;
+}
+
+static void setup_inter_frame(VP9_COMMON *cm) {
+  if (cm->error_resilient_mode || cm->intra_only)
+    vp9_setup_past_independence(cm);
+
+  assert(cm->frame_context_idx < FRAME_CONTEXTS);
+  cm->fc = cm->frame_contexts[cm->frame_context_idx];
+}
+
 void vp9_initialize_enc() {
   static int init_done = 0;
 
   if (!init_done) {
-    vp9_initialize_common();
+    vp9_init_neighbors();
+    vp9_init_quant_tables();
+
     vp9_coef_tree_initialize();
     vp9_tokenize_initialize();
-    vp9_init_quant_tables();
     vp9_init_me_luts();
     vp9_rc_init_minq_luts();
-    // init_base_skip_probs();
     vp9_entropy_mv_init();
     vp9_entropy_mode_init();
     init_done = 1;
@@ -163,6 +156,7 @@ void vp9_initialize_enc() {
 
 static void dealloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  int i;
 
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
@@ -173,16 +167,19 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   cpi->coding_context.last_frame_seg_map_copy = NULL;
 
   vpx_free(cpi->complexity_map);
-  cpi->complexity_map = 0;
-  vpx_free(cpi->cyclic_refresh.map);
-  cpi->cyclic_refresh.map = 0;
+  cpi->complexity_map = NULL;
+
+  vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
   vpx_free(cpi->active_map);
-  cpi->active_map = 0;
+  cpi->active_map = NULL;
 
   vp9_free_frame_buffers(cm);
 
   vp9_free_frame_buffer(&cpi->last_frame_uf);
   vp9_free_frame_buffer(&cpi->scaled_source);
+  vp9_free_frame_buffer(&cpi->scaled_last_source);
   vp9_free_frame_buffer(&cpi->alt_ref_buffer);
   vp9_lookahead_destroy(cpi->lookahead);
 
@@ -195,103 +192,59 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free(cpi->mb_norm_activity_map);
   cpi->mb_norm_activity_map = 0;
 
-  vpx_free(cpi->above_context[0]);
-  cpi->above_context[0] = NULL;
-
-  vpx_free(cpi->above_seg_context);
-  cpi->above_seg_context = NULL;
-}
-
-// Computes a q delta (in "q index" terms) to get from a starting q value
-// to a target q value
-int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  int start_index = rc->worst_quality;
-  int target_index = rc->worst_quality;
-  int i;
-
-  // Convert the average q value to an index.
-  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
-    start_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qstart)
-      break;
+  for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
+    vpx_free(lc->rc_twopass_stats_in.buf);
+    lc->rc_twopass_stats_in.buf = NULL;
+    lc->rc_twopass_stats_in.sz = 0;
   }
-
-  // Convert the q target to an index
-  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
-    target_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qtarget)
-      break;
-  }
-
-  return target_index - start_index;
 }
 
-// Computes a q delta (in "q index" terms) to get from a starting q value
-// to a value that should equate to the given rate ratio.
-int vp9_compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index,
-                               double rate_target_ratio) {
-  int i;
-  int target_index = cpi->rc.worst_quality;
+static void save_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
 
-  // Look up the current projected bits per block for the base index
-  const int base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
-                                            base_q_index, 1.0);
+  // Stores a snapshot of key state variables which can subsequently be
+  // restored with a call to restore_coding_context. These functions are
+  // intended for use in a re-code loop in vp9_compress_frame where the
+  // quantizer value is adjusted between loop iterations.
+  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
+  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
+  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
 
-  // Find the target bits per mb based on the base value and given ratio.
-  const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+  vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
 
-  // Convert the q target to an index
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; ++i) {
-    target_index = i;
-    if (vp9_rc_bits_per_mb(cpi->common.frame_type, i, 1.0) <=
-            target_bits_per_mb )
-      break;
-  }
+  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
+             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
-  return target_index - base_q_index;
-}
-
-// This function sets up a set of segments with delta Q values around
-// the baseline frame quantizer.
-static void setup_in_frame_q_adj(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  struct segmentation *const seg = &cm->seg;
-
-  // Make SURE use of floating point in this function is safe.
-  vp9_clear_system_state();
+  vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+  vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
 
-  if (cm->frame_type == KEY_FRAME ||
-      cpi->refresh_alt_ref_frame ||
-      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
-    int segment;
+  cc->fc = cm->fc;
+}
 
-    // Clear down the segment map
-    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+static void restore_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
 
-    // Clear down the complexity map used for rd
-    vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+  // Restore key state variables to the snapshot state stored in the
+  // previous call to save_coding_context.
+  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
+  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
 
-    vp9_enable_segmentation(seg);
-    vp9_clearall_segfeatures(seg);
+  vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
 
-    // Select delta coding method
-    seg->abs_delta = SEGMENT_DELTADATA;
+  vpx_memcpy(cm->last_frame_seg_map,
+             cpi->coding_context.last_frame_seg_map_copy,
+             (cm->mi_rows * cm->mi_cols));
 
-    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q
-    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+  vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+  vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
 
-    // Use some of the segments for in frame Q adjustment
-    for (segment = 1; segment < 2; segment++) {
-      const int qindex_delta =
-          vp9_compute_qdelta_by_rate(cpi,
-                                     cm->base_qindex,
-                                     in_frame_q_adj_ratio[segment]);
-      vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
-      vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
-    }
-  }
+  cm->fc = cc->fc;
 }
+
 static void configure_static_seg_features(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -335,7 +288,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
       seg->update_map = 1;
       seg->update_data = 1;
 
-      qi_delta = vp9_compute_qdelta(cpi, rc->avg_q, rc->avg_q * 0.875);
+      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875);
       vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
       vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
 
@@ -356,7 +309,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
         seg->update_data = 1;
         seg->abs_delta = SEGMENT_DELTADATA;
 
-        qi_delta = vp9_compute_qdelta(cpi, rc->avg_q, rc->avg_q * 1.125);
+        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125);
         vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
         vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
@@ -446,7 +399,7 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) {
     uint8_t *cache = cache_ptr;
     for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
       cache[0] = mi_8x8[0]->mbmi.segment_id;
-    mi_8x8_ptr += cm->mode_info_stride;
+    mi_8x8_ptr += cm->mi_stride;
     cache_ptr += cm->mi_cols;
   }
 }
@@ -455,557 +408,137 @@ static int is_slowest_mode(int mode) {
 }
 
 static void set_rd_speed_thresholds(VP9_COMP *cpi) {
-  SPEED_FEATURES *sf = &cpi->sf;
   int i;
 
   // Set baseline threshold values
   for (i = 0; i < MAX_MODES; ++i)
-    sf->thresh_mult[i] = is_slowest_mode(cpi->oxcf.mode) ? -500 : 0;
-
-  sf->thresh_mult[THR_NEARESTMV] = 0;
-  sf->thresh_mult[THR_NEARESTG] = 0;
-  sf->thresh_mult[THR_NEARESTA] = 0;
-
-  sf->thresh_mult[THR_DC] += 1000;
-
-  sf->thresh_mult[THR_NEWMV] += 1000;
-  sf->thresh_mult[THR_NEWA] += 1000;
-  sf->thresh_mult[THR_NEWG] += 1000;
-
-  sf->thresh_mult[THR_NEARMV] += 1000;
-  sf->thresh_mult[THR_NEARA] += 1000;
-  sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
-  sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
-
-  sf->thresh_mult[THR_TM] += 1000;
-
-  sf->thresh_mult[THR_COMP_NEARLA] += 1500;
-  sf->thresh_mult[THR_COMP_NEWLA] += 2000;
-  sf->thresh_mult[THR_NEARG] += 1000;
-  sf->thresh_mult[THR_COMP_NEARGA] += 1500;
-  sf->thresh_mult[THR_COMP_NEWGA] += 2000;
-
-  sf->thresh_mult[THR_ZEROMV] += 2000;
-  sf->thresh_mult[THR_ZEROG] += 2000;
-  sf->thresh_mult[THR_ZEROA] += 2000;
-  sf->thresh_mult[THR_COMP_ZEROLA] += 2500;
-  sf->thresh_mult[THR_COMP_ZEROGA] += 2500;
-
-  sf->thresh_mult[THR_H_PRED] += 2000;
-  sf->thresh_mult[THR_V_PRED] += 2000;
-  sf->thresh_mult[THR_D45_PRED ] += 2500;
-  sf->thresh_mult[THR_D135_PRED] += 2500;
-  sf->thresh_mult[THR_D117_PRED] += 2500;
-  sf->thresh_mult[THR_D153_PRED] += 2500;
-  sf->thresh_mult[THR_D207_PRED] += 2500;
-  sf->thresh_mult[THR_D63_PRED] += 2500;
+    cpi->rd_thresh_mult[i] = is_slowest_mode(cpi->oxcf.mode) ? -500 : 0;
+
+  cpi->rd_thresh_mult[THR_NEARESTMV] = 0;
+  cpi->rd_thresh_mult[THR_NEARESTG] = 0;
+  cpi->rd_thresh_mult[THR_NEARESTA] = 0;
+
+  cpi->rd_thresh_mult[THR_DC] += 1000;
+
+  cpi->rd_thresh_mult[THR_NEWMV] += 1000;
+  cpi->rd_thresh_mult[THR_NEWA] += 1000;
+  cpi->rd_thresh_mult[THR_NEWG] += 1000;
+
+  cpi->rd_thresh_mult[THR_NEARMV] += 1000;
+  cpi->rd_thresh_mult[THR_NEARA] += 1000;
+  cpi->rd_thresh_mult[THR_COMP_NEARESTLA] += 1000;
+  cpi->rd_thresh_mult[THR_COMP_NEARESTGA] += 1000;
+
+  cpi->rd_thresh_mult[THR_TM] += 1000;
+
+  cpi->rd_thresh_mult[THR_COMP_NEARLA] += 1500;
+  cpi->rd_thresh_mult[THR_COMP_NEWLA] += 2000;
+  cpi->rd_thresh_mult[THR_NEARG] += 1000;
+  cpi->rd_thresh_mult[THR_COMP_NEARGA] += 1500;
+  cpi->rd_thresh_mult[THR_COMP_NEWGA] += 2000;
+
+  cpi->rd_thresh_mult[THR_ZEROMV] += 2000;
+  cpi->rd_thresh_mult[THR_ZEROG] += 2000;
+  cpi->rd_thresh_mult[THR_ZEROA] += 2000;
+  cpi->rd_thresh_mult[THR_COMP_ZEROLA] += 2500;
+  cpi->rd_thresh_mult[THR_COMP_ZEROGA] += 2500;
+
+  cpi->rd_thresh_mult[THR_H_PRED] += 2000;
+  cpi->rd_thresh_mult[THR_V_PRED] += 2000;
+  cpi->rd_thresh_mult[THR_D45_PRED ] += 2500;
+  cpi->rd_thresh_mult[THR_D135_PRED] += 2500;
+  cpi->rd_thresh_mult[THR_D117_PRED] += 2500;
+  cpi->rd_thresh_mult[THR_D153_PRED] += 2500;
+  cpi->rd_thresh_mult[THR_D207_PRED] += 2500;
+  cpi->rd_thresh_mult[THR_D63_PRED] += 2500;
 
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
-    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEWMV    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEARESTMV] = INT_MAX;
+    cpi->rd_thresh_mult[THR_ZEROMV   ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEARMV   ] = INT_MAX;
   }
   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEARESTG ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_ZEROG    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEARG    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEWG     ] = INT_MAX;
   }
   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEARESTA ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_ZEROA    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEARA    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_NEWA     ] = INT_MAX;
   }
 
   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
       (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
   }
   if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
       (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
+    cpi->rd_thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
   }
 }
 
 static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
-  SPEED_FEATURES *sf = &cpi->sf;
+  const SPEED_FEATURES *const sf = &cpi->sf;
   int i;
 
   for (i = 0; i < MAX_REFS; ++i)
-    sf->thresh_mult_sub8x8[i] = is_slowest_mode(cpi->oxcf.mode)  ? -500 : 0;
+    cpi->rd_thresh_mult_sub8x8[i] = is_slowest_mode(cpi->oxcf.mode)  ? -500 : 0;
 
-  sf->thresh_mult_sub8x8[THR_LAST] += 2500;
-  sf->thresh_mult_sub8x8[THR_GOLD] += 2500;
-  sf->thresh_mult_sub8x8[THR_ALTR] += 2500;
-  sf->thresh_mult_sub8x8[THR_INTRA] += 2500;
-  sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
-  sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
+  cpi->rd_thresh_mult_sub8x8[THR_LAST] += 2500;
+  cpi->rd_thresh_mult_sub8x8[THR_GOLD] += 2500;
+  cpi->rd_thresh_mult_sub8x8[THR_ALTR] += 2500;
+  cpi->rd_thresh_mult_sub8x8[THR_INTRA] += 2500;
+  cpi->rd_thresh_mult_sub8x8[THR_COMP_LA] += 4500;
+  cpi->rd_thresh_mult_sub8x8[THR_COMP_GA] += 4500;
 
   // Check for masked out split cases.
-  for (i = 0; i < MAX_REFS; i++) {
+  for (i = 0; i < MAX_REFS; i++)
     if (sf->disable_split_mask & (1 << i))
-      sf->thresh_mult_sub8x8[i] = INT_MAX;
-  }
+      cpi->rd_thresh_mult_sub8x8[i] = INT_MAX;
 
   // disable mode test if frame flag is not set
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
-    sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
+    cpi->rd_thresh_mult_sub8x8[THR_LAST] = INT_MAX;
   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
-    sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
+    cpi->rd_thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
-    sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
+    cpi->rd_thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
       (VP9_LAST_FLAG | VP9_ALT_FLAG))
-    sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
+    cpi->rd_thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
   if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
       (VP9_GOLD_FLAG | VP9_ALT_FLAG))
-    sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
+    cpi->rd_thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
 }
 
-static void set_good_speed_feature(VP9_COMMON *cm,
-                                   SPEED_FEATURES *sf,
-                                   int speed) {
-  int i;
-  sf->adaptive_rd_thresh = 1;
-  sf->recode_loop = ((speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW);
-  if (speed == 1) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check  = 1;
-    sf->tx_size_search_method = frame_is_intra_only(cm)
-      ? USE_FULL_RD : USE_LARGESTALL;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ?
-        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->adaptive_pred_interp_filter = 1;
-    sf->auto_mv_step_size = 1;
-    sf->adaptive_rd_thresh = 2;
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-  }
-  if (speed == 2) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check  = 1;
-    sf->tx_size_search_method = frame_is_intra_only(cm)
-      ? USE_FULL_RD : USE_LARGESTALL;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ?
-        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                 FLAG_SKIP_INTRA_BESTINTER |
-                                 FLAG_SKIP_COMP_BESTINTRA |
-                                 FLAG_SKIP_INTRA_LOWVAR;
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->adaptive_pred_interp_filter = 2;
-    sf->reference_masking = 1;
-    sf->auto_mv_step_size = 1;
-
-    sf->disable_filter_search_var_thresh = 50;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-    sf->adjust_partitioning_from_last_frame = 1;
-    sf->last_partitioning_redo_frequency = 3;
-
-    sf->adaptive_rd_thresh = 2;
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->use_lp32x32fdct = 1;
-    sf->mode_skip_start = 11;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-  }
-  if (speed == 3) {
-    sf->use_square_partition_only = 1;
-    sf->tx_size_search_method = USE_LARGESTALL;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-      FLAG_SKIP_INTRA_BESTINTER |
-      FLAG_SKIP_COMP_BESTINTRA |
-      FLAG_SKIP_INTRA_LOWVAR;
-
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->adaptive_pred_interp_filter = 2;
-    sf->reference_masking = 1;
-    sf->auto_mv_step_size = 1;
-
-    sf->disable_split_var_thresh = 32;
-    sf->disable_filter_search_var_thresh = 100;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-    sf->adjust_partitioning_from_last_frame = 1;
-    sf->last_partitioning_redo_frequency = 3;
-
-    sf->use_uv_intra_rd_estimate = 1;
-    sf->skip_encode_sb = 1;
-    sf->use_lp32x32fdct = 1;
-    sf->subpel_iters_per_step = 1;
-    sf->use_fast_coef_updates = 2;
-    sf->use_fast_coef_costing = 1;
-
-    sf->adaptive_rd_thresh = 4;
-    sf->mode_skip_start = 6;
-  }
-  if (speed == 4) {
-    sf->use_square_partition_only = 1;
-    sf->tx_size_search_method = USE_LARGESTALL;
-    sf->disable_split_mask = DISABLE_ALL_SPLIT;
-
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-      FLAG_SKIP_INTRA_BESTINTER |
-      FLAG_SKIP_COMP_BESTINTRA |
-      FLAG_SKIP_COMP_REFMISMATCH |
-      FLAG_SKIP_INTRA_LOWVAR |
-      FLAG_EARLY_TERMINATE;
-
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->adaptive_pred_interp_filter = 2;
-    sf->reference_masking = 1;
-    sf->auto_mv_step_size = 1;
-
-    sf->disable_split_var_thresh = 64;
-    sf->disable_filter_search_var_thresh = 200;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-    sf->adjust_partitioning_from_last_frame = 1;
-    sf->last_partitioning_redo_frequency = 3;
-
-    sf->use_uv_intra_rd_estimate = 1;
-    sf->skip_encode_sb = 1;
-    sf->use_lp32x32fdct = 1;
-    sf->subpel_iters_per_step = 1;
-    sf->use_fast_coef_updates = 2;
-    sf->use_fast_coef_costing = 1;
-
-    sf->adaptive_rd_thresh = 4;
-    sf->mode_skip_start = 6;
-  }
-  if (speed >= 5) {
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-    sf->partition_search_type = FIXED_PARTITION;
-    sf->tx_size_search_method = frame_is_intra_only(cm) ?
-      USE_FULL_RD : USE_LARGESTALL;
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                 FLAG_SKIP_INTRA_BESTINTER |
-                                 FLAG_SKIP_COMP_BESTINTRA |
-                                 FLAG_SKIP_COMP_REFMISMATCH |
-                                 FLAG_SKIP_INTRA_LOWVAR |
-                                 FLAG_EARLY_TERMINATE;
-    sf->use_rd_breakout = 1;
-    sf->use_lp32x32fdct = 1;
-    sf->optimize_coefficients = 0;
-    sf->auto_mv_step_size = 1;
-    sf->reference_masking = 1;
-
-    sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    sf->search_method = HEX;
-    sf->subpel_iters_per_step = 1;
-    sf->disable_split_var_thresh = 64;
-    sf->disable_filter_search_var_thresh = 500;
-    for (i = 0; i < TX_SIZES; i++) {
-      sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
-      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
-    }
-    sf->use_fast_coef_updates = 2;
-    sf->use_fast_coef_costing = 1;
-    sf->adaptive_rd_thresh = 4;
-    sf->mode_skip_start = 6;
-  }
-}
-
-static void set_rt_speed_feature(VP9_COMMON *cm,
-                                 SPEED_FEATURES *sf,
-                                 int speed) {
-  sf->static_segmentation = 0;
-  sf->adaptive_rd_thresh = 1;
-  sf->recode_loop = ((speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW);
-  sf->encode_breakout_thresh = 1;
-  sf->use_fast_coef_costing = 1;
-
-  if (speed == 1) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check = 1;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ?
-        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->adaptive_pred_interp_filter = 1;
-    sf->auto_mv_step_size = 1;
-    sf->adaptive_rd_thresh = 2;
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->encode_breakout_thresh = 8;
-  }
-  if (speed >= 2) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check = 1;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ?
-        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
-        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
-        | FLAG_SKIP_INTRA_LOWVAR;
-
-    sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->adaptive_pred_interp_filter = 2;
-    sf->auto_mv_step_size = 1;
-    sf->reference_masking = 1;
-
-    sf->disable_filter_search_var_thresh = 50;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-    sf->adjust_partitioning_from_last_frame = 1;
-    sf->last_partitioning_redo_frequency = 3;
-
-    sf->adaptive_rd_thresh = 2;
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->use_lp32x32fdct = 1;
-    sf->mode_skip_start = 11;
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->encode_breakout_thresh = 200;
-  }
-  if (speed >= 3) {
-    sf->use_square_partition_only = 1;
-    sf->tx_size_search_method = USE_LARGESTALL;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
-        | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
-        | FLAG_SKIP_INTRA_LOWVAR;
-
-    sf->disable_filter_search_var_thresh = 100;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-    sf->use_uv_intra_rd_estimate = 1;
-    sf->skip_encode_sb = 1;
-    sf->subpel_iters_per_step = 1;
-    sf->use_fast_coef_updates = 2;
-    sf->adaptive_rd_thresh = 4;
-    sf->mode_skip_start = 6;
-    sf->encode_breakout_thresh = 400;
-  }
-  if (speed >= 4) {
-    sf->optimize_coefficients = 0;
-    sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    sf->use_fast_lpf_pick = 2;
-    sf->encode_breakout_thresh = 700;
-  }
-  if (speed >= 5) {
-    int i;
-    sf->last_partitioning_redo_frequency = 4;
-    sf->adaptive_rd_thresh = 5;
-    sf->use_fast_coef_costing = 0;
-    sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
-    sf->adjust_partitioning_from_last_frame =
-        cm->last_frame_type != cm->frame_type || (0 ==
-        (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency);
-    sf->subpel_force_stop = 1;
-    for (i = 0; i < TX_SIZES; i++) {
-      sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
-      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
-    }
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY;
-    sf->frame_parameter_update = 0;
-    sf->encode_breakout_thresh = 1000;
-    sf->search_method = FAST_HEX;
-    sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
-    sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->max_intra_bsize = BLOCK_32X32;
-  }
-  if (speed >= 6) {
-    sf->partition_check =
-        (cm->current_video_frame % sf->last_partitioning_redo_frequency == 1);
-    sf->partition_search_type = REFERENCE_PARTITION;
-    sf->use_nonrd_pick_mode = 1;
-    sf->search_method = FAST_DIAMOND;
-  }
-  if (speed >= 7) {
-    sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
-    sf->use_nonrd_pick_mode = 1;
-    sf->search_method = FAST_DIAMOND;
-  }
-  if (speed >= 8) {
-    int i;
-    for (i = 0; i < BLOCK_SIZES; ++i)
-      sf->disable_inter_mode_mask[i] = 14;    // only search NEARESTMV (0)
-  }
-}
-
-void vp9_set_speed_features(VP9_COMP *cpi) {
-  SPEED_FEATURES *sf = &cpi->sf;
-  VP9_COMMON *cm = &cpi->common;
-  int speed = cpi->speed;
-  int i;
-
-  // Convert negative speed to positive
-  if (speed < 0)
-    speed = -speed;
-
+static void set_speed_features(VP9_COMP *cpi) {
 #if CONFIG_INTERNAL_STATS
+  int i;
   for (i = 0; i < MAX_MODES; ++i)
     cpi->mode_chosen_counts[i] = 0;
 #endif
 
-  // best quality defaults
-  sf->frame_parameter_update = 1;
-  sf->search_method = NSTEP;
-  sf->recode_loop = ALLOW_RECODE;
-  sf->subpel_search_method = SUBPEL_TREE;
-  sf->subpel_iters_per_step = 2;
-  sf->subpel_force_stop = 0;
-  sf->optimize_coefficients = !cpi->oxcf.lossless;
-  sf->reduce_first_step_size = 0;
-  sf->auto_mv_step_size = 0;
-  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
-  sf->adaptive_rd_thresh = 0;
-  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
-  sf->tx_size_search_method = USE_FULL_RD;
-  sf->use_lp32x32fdct = 0;
-  sf->adaptive_motion_search = 0;
-  sf->adaptive_pred_interp_filter = 0;
-  sf->reference_masking = 0;
-  sf->partition_search_type = SEARCH_PARTITION;
-  sf->less_rectangular_check = 0;
-  sf->use_square_partition_only = 0;
-  sf->auto_min_max_partition_size = NOT_IN_USE;
-  sf->max_partition_size = BLOCK_64X64;
-  sf->min_partition_size = BLOCK_4X4;
-  sf->adjust_partitioning_from_last_frame = 0;
-  sf->last_partitioning_redo_frequency = 4;
-  sf->disable_split_mask = 0;
-  sf->mode_search_skip_flags = 0;
-  sf->disable_split_var_thresh = 0;
-  sf->disable_filter_search_var_thresh = 0;
-  for (i = 0; i < TX_SIZES; i++) {
-    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
-    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
-  }
-  sf->use_rd_breakout = 0;
-  sf->skip_encode_sb = 0;
-  sf->use_uv_intra_rd_estimate = 0;
-  sf->use_fast_lpf_pick = 0;
-  sf->use_fast_coef_updates = 0;
-  sf->use_fast_coef_costing = 0;
-  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
-  sf->use_nonrd_pick_mode = 0;
-  sf->encode_breakout_thresh = 0;
-  for (i = 0; i < BLOCK_SIZES; ++i)
-    sf->disable_inter_mode_mask[i] = 0;
-  sf->max_intra_bsize = BLOCK_64X64;
-  // This setting only takes effect when partition_search_type is set
-  // to FIXED_PARTITION.
-  sf->always_this_block_size = BLOCK_16X16;
-
-  switch (cpi->oxcf.mode) {
-    case MODE_BESTQUALITY:
-    case MODE_SECONDPASS_BEST:  // This is the best quality mode.
-      cpi->diamond_search_sad = vp9_full_range_search;
-      break;
-    case MODE_FIRSTPASS:
-    case MODE_GOODQUALITY:
-    case MODE_SECONDPASS:
-      set_good_speed_feature(cm, sf, speed);
-      break;
-    case MODE_REALTIME:
-      set_rt_speed_feature(cm, sf, speed);
-      break;
-  }; /* switch */
+  vp9_set_speed_features(cpi);
 
   // Set rd thresholds based on mode and speed setting
   set_rd_speed_thresholds(cpi);
   set_rd_speed_thresholds_sub8x8(cpi);
 
-  // Slow quant, dct and trellis not worthwhile for first pass
-  // so make sure they are always turned off.
-  if (cpi->pass == 1) {
-    sf->optimize_coefficients = 0;
-  }
-
-  // No recode for 1 pass.
-  if (cpi->pass == 0) {
-    sf->recode_loop = DISALLOW_RECODE;
-    sf->optimize_coefficients = 0;
-  }
-
   cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
   if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
     cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
   }
-
-  if (cpi->sf.subpel_search_method == SUBPEL_TREE) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
-    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
-  }
-
-  cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
-
-  if (cpi->encode_breakout && cpi->oxcf.mode == MODE_REALTIME &&
-      sf->encode_breakout_thresh > cpi->encode_breakout)
-    cpi->encode_breakout = sf->encode_breakout_thresh;
-
-  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
-    sf->adaptive_pred_interp_filter = 0;
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
@@ -1048,6 +581,13 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
+  if (vp9_alloc_frame_buffer(&cpi->scaled_last_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9_ENC_BORDER_IN_PIXELS))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled last source buffer");
+
   vpx_free(cpi->tok);
 
   {
@@ -1065,24 +605,12 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
   CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
-  // block where mi unit size is 8x8.
-  vpx_free(cpi->above_context[0]);
-  CHECK_MEM_ERROR(cm, cpi->above_context[0],
-                  vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) *
-                             MAX_MB_PLANE,
-                             sizeof(*cpi->above_context[0])));
-
-  vpx_free(cpi->above_seg_context);
-  CHECK_MEM_ERROR(cm, cpi->above_seg_context,
-                  vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols),
-                             sizeof(*cpi->above_seg_context)));
 }
 
 
 static void update_frame_size(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   vp9_update_frame_size(cm);
 
@@ -1101,6 +629,13 @@ static void update_frame_size(VP9_COMP *cpi) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
+  if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled last source buffer");
+
   {
     int y_stride = cpi->scaled_source.y_stride;
 
@@ -1111,14 +646,7 @@ static void update_frame_size(VP9_COMP *cpi) {
     }
   }
 
-  {
-    int i;
-    for (i = 1; i < MAX_MB_PLANE; ++i) {
-      cpi->above_context[i] = cpi->above_context[0] +
-                              i * sizeof(*cpi->above_context[0]) * 2 *
-                              mi_cols_aligned_to_sb(cm->mi_cols);
-    }
-  }
+  init_macroblockd(cm, xd);
 }
 
 // Table that converts 0-63 Q range values passed in outside to the Qindex
@@ -1153,10 +681,9 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
   oxcf->framerate = framerate < 0.1 ? 30 : framerate;
   cpi->output_framerate = cpi->oxcf.framerate;
   rc->av_per_frame_bandwidth = (int)(oxcf->target_bandwidth /
-                                         cpi->output_framerate);
+                                     cpi->output_framerate);
   rc->min_frame_bandwidth = (int)(rc->av_per_frame_bandwidth *
-                                      oxcf->two_pass_vbrmin_section / 100);
-
+                                  oxcf->two_pass_vbrmin_section / 100);
 
   rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
 
@@ -1213,12 +740,12 @@ static void set_tile_limits(VP9_COMP *cpi) {
 
 static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
   int i;
 
   cpi->oxcf = *oxcf;
 
-  cm->version = oxcf->version;
+  cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
@@ -1231,43 +758,16 @@ static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
   // Temporal scalability.
   cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
 
-  if (cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ||
+      (cpi->svc.number_spatial_layers > 1 &&
+      cpi->oxcf.mode == MODE_SECONDPASS_BEST)) {
     vp9_init_layer_context(cpi);
   }
 
   // change includes all joint functionality
   vp9_change_config(cpi, oxcf);
 
-  // Initialize active best and worst q and average q values.
-  if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    rc->avg_frame_qindex[0] = cpi->oxcf.worst_allowed_q;
-    rc->avg_frame_qindex[1] = cpi->oxcf.worst_allowed_q;
-    rc->avg_frame_qindex[2] = cpi->oxcf.worst_allowed_q;
-  } else {
-    rc->avg_frame_qindex[0] = (cpi->oxcf.worst_allowed_q +
-                                  cpi->oxcf.best_allowed_q) / 2;
-    rc->avg_frame_qindex[1] = (cpi->oxcf.worst_allowed_q +
-                                  cpi->oxcf.best_allowed_q) / 2;
-    rc->avg_frame_qindex[2] = (cpi->oxcf.worst_allowed_q +
-                                  cpi->oxcf.best_allowed_q) / 2;
-  }
-  rc->last_q[0] = cpi->oxcf.best_allowed_q;
-  rc->last_q[1] = cpi->oxcf.best_allowed_q;
-  rc->last_q[2] = cpi->oxcf.best_allowed_q;
-
-  // Initialise the starting buffer levels
-  rc->buffer_level    = cpi->oxcf.starting_buffer_level;
-  rc->bits_off_target = cpi->oxcf.starting_buffer_level;
-
-  rc->rolling_target_bits      = rc->av_per_frame_bandwidth;
-  rc->rolling_actual_bits      = rc->av_per_frame_bandwidth;
-  rc->long_rolling_target_bits = rc->av_per_frame_bandwidth;
-  rc->long_rolling_actual_bits = rc->av_per_frame_bandwidth;
-
-  rc->total_actual_bits = 0;
-  rc->total_target_vs_actual = 0;
-
   cpi->static_mb_pct = 0;
 
   cpi->lst_fb_idx = 0;
@@ -1281,15 +781,18 @@ static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
     cpi->fixed_divide[i] = 0x80000 / i;
 }
 
-void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
+void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
 
-  if (!cpi || !oxcf)
-    return;
+  if (cm->profile != oxcf->profile)
+    cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
 
-  if (cm->version != oxcf->version) {
-    cm->version = oxcf->version;
-  }
+  if (cm->profile <= PROFILE_1)
+    assert(cm->bit_depth == BITS_8);
+  else
+    assert(cm->bit_depth > BITS_8);
 
   cpi->oxcf = *oxcf;
 
@@ -1325,15 +828,17 @@ void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
       break;
   }
 
-  cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-  cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
-  cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
   cpi->oxcf.lossless = oxcf->lossless;
-  cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
-                                              : vp9_idct4x4_add;
-  cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
+  if (cpi->oxcf.lossless) {
+    // In lossless mode, make sure the right quantizer range and the correct
+    // transform are set.
+    cpi->oxcf.worst_allowed_q = 0;
+    cpi->oxcf.best_allowed_q = 0;
+    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
+  } else {
+    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
+  }
+  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
   cpi->refresh_golden_frame = 0;
@@ -1382,17 +887,15 @@ void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
                     cpi->oxcf.target_bandwidth, 1000);
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
-  cpi->rc.bits_off_target = MIN(cpi->rc.bits_off_target,
-                                cpi->oxcf.maximum_buffer_size);
-  cpi->rc.buffer_level = MIN(cpi->rc.buffer_level,
-                             cpi->oxcf.maximum_buffer_size);
+  rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size);
+  rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
   vp9_new_framerate(cpi, cpi->oxcf.framerate);
 
   // Set absolute upper and lower quality limits
-  cpi->rc.worst_quality = cpi->oxcf.worst_allowed_q;
-  cpi->rc.best_quality = cpi->oxcf.best_allowed_q;
+  rc->worst_quality = cpi->oxcf.worst_allowed_q;
+  rc->best_quality = cpi->oxcf.best_allowed_q;
 
   // active values should only be modified if out of new range
 
@@ -1417,8 +920,9 @@ void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
   }
   update_frame_size(cpi);
 
-  if (cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ||
+      (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) {
     vp9_update_layer_context_change_config(cpi,
                                            (int)cpi->oxcf.target_bandwidth);
   }
@@ -1434,7 +938,7 @@ void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
 #else
   cpi->alt_ref_source = NULL;
 #endif
-  cpi->rc.is_src_frame_alt_ref = 0;
+  rc->is_src_frame_alt_ref = 0;
 
 #if 0
   // Experimental RD Code
@@ -1455,7 +959,7 @@ static void cal_nmvjointsadcost(int *mvjointsadcost) {
   mvjointsadcost[0] = 600;
   mvjointsadcost[1] = 300;
   mvjointsadcost[2] = 300;
-  mvjointsadcost[0] = 300;
+  mvjointsadcost[3] = 300;
 }
 
 static void cal_nmvsadcosts(int *mvsadcost[2]) {
@@ -1611,7 +1115,6 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
   int i, j;
   VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
   VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
-  RATE_CONTROL *const rc = cpi != NULL ? &cpi->rc : NULL;
 
   if (!cm)
     return NULL;
@@ -1634,6 +1137,7 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
   cpi->use_svc = 0;
 
   init_config(cpi, oxcf);
+  vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc);
   init_pick_mode_context(cpi);
 
   cm->current_video_frame = 0;
@@ -1641,8 +1145,6 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
   // Set reference frame sign bias for ALTREF frame to 1 (for now)
   cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
 
-  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
   cpi->gold_is_last = 0;
   cpi->alt_is_last = 0;
   cpi->gold_is_alt = 0;
@@ -1656,8 +1158,8 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // Create a map used for cyclic background refresh.
-  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh.map,
-                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
 
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
@@ -1678,13 +1180,6 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
   /*Initialize the feed-forward activity masking.*/
   cpi->activity_avg = 90 << 12;
   cpi->key_frame_frequency = cpi->oxcf.key_freq;
-
-  rc->frames_since_key = 8;  // Sensible default for first frame.
-  rc->this_key_frame_forced = 0;
-  rc->next_key_frame_forced = 0;
-
-  rc->source_alt_ref_pending = 0;
-  rc->source_alt_ref_active = 0;
   cpi->refresh_alt_ref_frame = 0;
 
 #if CONFIG_MULTIPLE_ARF
@@ -1740,18 +1235,6 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
-  rc->frames_till_gf_update_due = 0;
-
-  rc->ni_av_qi = cpi->oxcf.worst_allowed_q;
-  rc->ni_tot_qi = 0;
-  rc->ni_frames = 0;
-  rc->tot_q = 0.0;
-  rc->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
-
-  rc->rate_correction_factor = 1.0;
-  rc->key_frame_rate_correction_factor = 1.0;
-  rc->gf_rate_correction_factor = 1.0;
-
   cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
   cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
   cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
@@ -1787,13 +1270,53 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
-    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
-    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
-    cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
-    vp9_init_second_pass(cpi);
+    if (cpi->svc.number_spatial_layers > 1
+        && cpi->svc.number_temporal_layers == 1) {
+      FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
+      FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
+      int i;
+
+      for (i = 0; i < oxcf->ss_number_layers; ++i) {
+        FIRSTPASS_STATS *const last_packet_for_layer =
+            &stats[packets - oxcf->ss_number_layers + i];
+        const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
+        const int packets_in_layer = (int)last_packet_for_layer->count + 1;
+        if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
+          LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id];
+
+          vpx_free(lc->rc_twopass_stats_in.buf);
+
+          lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
+          CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+                          vpx_malloc(lc->rc_twopass_stats_in.sz));
+          lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
+          lc->twopass.stats_in = lc->twopass.stats_in_start;
+          lc->twopass.stats_in_end = lc->twopass.stats_in_start
+                                     + packets_in_layer - 1;
+          stats_copy[layer_id] = lc->rc_twopass_stats_in.buf;
+        }
+      }
+
+      for (i = 0; i < packets; ++i) {
+        const int layer_id = (int)stats[i].spatial_layer_id;
+        if (layer_id >= 0 && layer_id < oxcf->ss_number_layers
+            && stats_copy[layer_id] != NULL) {
+          *stats_copy[layer_id] = stats[i];
+          ++stats_copy[layer_id];
+        }
+      }
+
+      vp9_init_second_pass_spatial_svc(cpi);
+    } else {
+      cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+      cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+      cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+      vp9_init_second_pass(cpi);
+    }
   }
 
-  vp9_set_speed_features(cpi);
+  set_speed_features(cpi);
 
   // Default rd threshold factors for mode selection
   for (i = 0; i < BLOCK_SIZES; ++i) {
@@ -2046,53 +1569,42 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
 
 #endif
 }
+static int64_t get_sse(const uint8_t *a, int a_stride,
+                       const uint8_t *b, int b_stride,
+                       int width, int height) {
+  const int dw = width % 16;
+  const int dh = height % 16;
+  int64_t total_sse = 0;
+  unsigned int sse = 0;
+  int sum = 0;
+  int x, y;
+
+  if (dw > 0) {
+    variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+             dw, height, &sse, &sum);
+    total_sse += sse;
+  }
 
+  if (dh > 0) {
+    variance(&a[(height - dh) * a_stride], a_stride,
+             &b[(height - dh) * b_stride], b_stride,
+             width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
 
-static uint64_t calc_plane_error(const uint8_t *orig, int orig_stride,
-                                 const uint8_t *recon, int recon_stride,
-                                 unsigned int cols, unsigned int rows) {
-  unsigned int row, col;
-  uint64_t total_sse = 0;
-  int diff;
-
-  for (row = 0; row + 16 <= rows; row += 16) {
-    for (col = 0; col + 16 <= cols; col += 16) {
-      unsigned int sse;
-
-      vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
       total_sse += sse;
-    }
-
-    /* Handle odd-sized width */
-    if (col < cols) {
-      unsigned int border_row, border_col;
-      const uint8_t *border_orig = orig;
-      const uint8_t *border_recon = recon;
 
-      for (border_row = 0; border_row < 16; border_row++) {
-        for (border_col = col; border_col < cols; border_col++) {
-          diff = border_orig[border_col] - border_recon[border_col];
-          total_sse += diff * diff;
-        }
-
-        border_orig += orig_stride;
-        border_recon += recon_stride;
-      }
+      pa += 16;
+      pb += 16;
     }
 
-    orig += orig_stride * 16;
-    recon += recon_stride * 16;
-  }
-
-  /* Handle odd-sized height */
-  for (; row < rows; row++) {
-    for (col = 0; col < cols; col++) {
-      diff = orig[col] - recon[col];
-      total_sse += diff * diff;
-    }
-
-    orig += orig_stride;
-    recon += recon_stride;
+    a += 16 * a_stride;
+    b += 16 * b_stride;
   }
 
   return total_sse;
@@ -2120,9 +1632,9 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
     const int w = widths[i];
     const int h = heights[i];
     const uint32_t samples = w * h;
-    const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i],
-                                          b_planes[i], b_strides[i],
-                                          w, h);
+    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
+                                 b_planes[i], b_strides[i],
+                                 w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
     psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse);
@@ -2521,7 +2033,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
 
     vpx_usec_timer_start(&timer);
 
-    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick);
+    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
 
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
@@ -2596,7 +2108,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
 
   vp9_clear_system_state();
 
-  recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
+  recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
     fprintf(f, "%10u %10d %10d %10d %10d %10d "
@@ -2652,7 +2164,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
                                        int q) {
   VP9_COMMON *const cm = &cpi->common;
   vp9_clear_system_state();
-  vp9_set_quantizer(cpi, q);
+  vp9_set_quantizer(cm, q);
 
   // Set up entropy context depending on frame type. The decoder mandates
   // the use of the default context, index 0, for keyframes and inter
@@ -2660,21 +2172,21 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
   // other inter-frames the encoder currently uses only two contexts;
   // context 1 for ALTREF frames and context 0 for the others.
   if (cm->frame_type == KEY_FRAME) {
-    vp9_setup_key_frame(cpi);
+    setup_key_frame(cpi);
   } else {
-    if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) {
-      cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
-    }
-    vp9_setup_inter_frame(cpi);
+    if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc)
+      cm->frame_context_idx = cpi->refresh_alt_ref_frame;
+
+    setup_inter_frame(cm);
   }
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     vp9_vaq_frame_setup(cpi);
   } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
-    setup_in_frame_q_adj(cpi);
+    vp9_setup_in_frame_q_adj(cpi);
   } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-    vp9_setup_cyclic_refresh_aq(cpi);
+    vp9_cyclic_refresh_setup(cpi);
   }
   // transform / motion compensation build reconstruction frame
   vp9_encode_frame(cpi);
@@ -2709,7 +2221,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
   do {
     vp9_clear_system_state();
 
-    vp9_set_quantizer(cpi, q);
+    vp9_set_quantizer(cm, q);
 
     if (loop_count == 0) {
       // Set up entropy context depending on frame type. The decoder mandates
@@ -2718,12 +2230,12 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
       // other inter-frames the encoder currently uses only two contexts;
       // context 1 for ALTREF frames and context 0 for the others.
       if (cm->frame_type == KEY_FRAME) {
-        vp9_setup_key_frame(cpi);
+        setup_key_frame(cpi);
       } else {
-        if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) {
+        if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc)
           cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
-        }
-        vp9_setup_inter_frame(cpi);
+
+        setup_inter_frame(cm);
       }
     }
 
@@ -2732,7 +2244,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
     if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
       vp9_vaq_frame_setup(cpi);
     } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
-      setup_in_frame_q_adj(cpi);
+      vp9_setup_in_frame_q_adj(cpi);
     }
 
     // transform / motion compensation build reconstruction frame
@@ -2748,13 +2260,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
     // accurate estimate of output frame size to determine if we need
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
-      vp9_save_coding_context(cpi);
+      save_coding_context(cpi);
       cpi->dummy_packing = 1;
       if (!cpi->sf.use_nonrd_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
       rc->projected_frame_size = (int)(*size) << 3;
-      vp9_restore_coding_context(cpi);
+      restore_coding_context(cpi);
 
       if (frame_over_shoot_limit == 0)
         frame_over_shoot_limit = 1;
@@ -2767,7 +2279,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
            rc->this_key_frame_forced &&
            (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
-        int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
+        int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
         int high_err_target = cpi->ambient_err;
         int low_err_target = cpi->ambient_err >> 1;
@@ -2971,6 +2483,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   } else {
     cpi->Source = cpi->un_scaled_source;
   }
+
+  // Scale the last source buffer, if required.
+  if (cpi->unscaled_last_source != NULL) {
+    if (cm->mi_cols * MI_SIZE != cpi->unscaled_last_source->y_width ||
+        cm->mi_rows * MI_SIZE != cpi->unscaled_last_source->y_height) {
+      scale_and_extend_frame_nonnormative(cpi->unscaled_last_source,
+                                          &cpi->scaled_last_source);
+      cpi->Last_Source = &cpi->scaled_last_source;
+    } else {
+      cpi->Last_Source = cpi->unscaled_last_source;
+    }
+  }
+
   vp9_scale_references(cpi);
 
   vp9_clear_system_state();
@@ -3008,7 +2533,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   // Set various flags etc to special state if it is a key frame.
   if (frame_is_intra_only(cm)) {
-    vp9_setup_key_frame(cpi);
+    setup_key_frame(cpi);
     // Reset the loop filter deltas and segmentation map.
     vp9_reset_segment_features(&cm->seg);
 
@@ -3090,6 +2615,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   vp9_write_yuv_frame(cpi->Source);
 #endif
 
+  set_speed_features(cpi);
+
   // Decide q and q bounds.
   q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
 
@@ -3099,8 +2626,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
   }
 
-  vp9_set_speed_features(cpi);
-
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
     encode_without_recode_loop(cpi, size, dest, q);
   } else {
@@ -3111,7 +2636,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
-    cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
+    cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
   }
 
   // If the encoder forced a KEY_FRAME decision
@@ -3231,11 +2756,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
+    if (cpi->use_svc)
+      vp9_inc_frame_in_layer(&cpi->svc);
   }
 
   // restore prev_mi
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
 }
 
 static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
@@ -3261,7 +2788,7 @@ static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
   (void) frame_flags;
 
   vp9_rc_get_first_pass_params(cpi);
-  vp9_set_quantizer(cpi, find_fp_qindex());
+  vp9_set_quantizer(&cpi->common, find_fp_qindex());
   vp9_first_pass(cpi);
 }
 
@@ -3272,7 +2799,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
   vp9_rc_get_second_pass_params(cpi);
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 
-  vp9_twopass_postencode_update(cpi, *size);
+  vp9_twopass_postencode_update(cpi);
 }
 
 static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
@@ -3306,7 +2833,7 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
-  if (cm->version == 0 && (subsampling_x != 1 || subsampling_y != 1)) {
+  if (cm->profile == PROFILE_0 && (subsampling_x != 1 || subsampling_y != 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
                        "Non-4:2:0 color space requires profile >= 1");
     res = -1;
@@ -3377,8 +2904,9 @@ void adjust_frame_rate(VP9_COMP *cpi) {
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  RATE_CONTROL *const rc = &cpi->rc;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   MV_REFERENCE_FRAME ref_frame;
@@ -3386,9 +2914,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   if (!cpi)
     return -1;
 
+  if (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2) {
+    vp9_restore_layer_context(cpi);
+  }
+
   vpx_usec_timer_start(&cmptimer);
 
   cpi->source = NULL;
+  cpi->last_source = NULL;
 
   set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
@@ -3400,7 +2933,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   cpi->refresh_alt_ref_frame = 0;
 
   // Should we code an alternate reference frame.
-  if (cpi->oxcf.play_alternate && cpi->rc.source_alt_ref_pending) {
+  if (cpi->oxcf.play_alternate && rc->source_alt_ref_pending) {
     int frames_to_arf;
 
 #if CONFIG_MULTIPLE_ARF
@@ -3412,9 +2945,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           - cpi->next_frame_in_order;
     else
 #endif
-      frames_to_arf = cpi->rc.frames_till_gf_update_due;
+      frames_to_arf = rc->frames_till_gf_update_due;
 
-    assert(frames_to_arf <= cpi->rc.frames_to_key);
+    assert(frames_to_arf <= rc->frames_to_key);
 
     if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
 #if CONFIG_MULTIPLE_ARF
@@ -3426,7 +2959,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
       if (cpi->oxcf.arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
         // TODO(agrange) merge these two functions.
-        vp9_configure_arnr_filter(cpi, frames_to_arf, cpi->rc.gfu_boost);
+        vp9_configure_arnr_filter(cpi, frames_to_arf, rc->gfu_boost);
         vp9_temporal_filter_prepare(cpi, frames_to_arf);
         vp9_extend_frame_borders(&cpi->alt_ref_buffer);
         force_src_buffer = &cpi->alt_ref_buffer;
@@ -3436,14 +2969,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
-      cpi->rc.is_src_frame_alt_ref = 0;
+      rc->is_src_frame_alt_ref = 0;
 
 #if CONFIG_MULTIPLE_ARF
       if (!cpi->multi_arf_enabled)
 #endif
-        cpi->rc.source_alt_ref_pending = 0;
+        rc->source_alt_ref_pending = 0;
     } else {
-      cpi->rc.source_alt_ref_pending = 0;
+      rc->source_alt_ref_pending = 0;
     }
   }
 
@@ -3451,25 +2984,32 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 #if CONFIG_MULTIPLE_ARF
     int i;
 #endif
+
+    // Get last frame source.
+    if (cm->current_video_frame > 0) {
+      if ((cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
+        return -1;
+    }
+
     if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
       cm->show_frame = 1;
       cm->intra_only = 0;
 
 #if CONFIG_MULTIPLE_ARF
       // Is this frame the ARF overlay.
-      cpi->rc.is_src_frame_alt_ref = 0;
+      rc->is_src_frame_alt_ref = 0;
       for (i = 0; i < cpi->arf_buffered; ++i) {
         if (cpi->source == cpi->alt_ref_source[i]) {
-          cpi->rc.is_src_frame_alt_ref = 1;
+          rc->is_src_frame_alt_ref = 1;
           cpi->refresh_golden_frame = 1;
           break;
         }
       }
 #else
-      cpi->rc.is_src_frame_alt_ref = cpi->alt_ref_source
-          && (cpi->source == cpi->alt_ref_source);
+      rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
+                                 (cpi->source == cpi->alt_ref_source);
 #endif
-      if (cpi->rc.is_src_frame_alt_ref) {
+      if (rc->is_src_frame_alt_ref) {
         // Current frame is an ARF overlay frame.
 #if CONFIG_MULTIPLE_ARF
         cpi->alt_ref_source[i] = NULL;
@@ -3489,13 +3029,20 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   if (cpi->source) {
     cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
                                                            : &cpi->source->img;
+
+    if (cpi->last_source != NULL) {
+      cpi->unscaled_last_source = &cpi->last_source->img;
+    } else {
+      cpi->unscaled_last_source = NULL;
+    }
+
     *time_stamp = cpi->source->ts_start;
     *time_end = cpi->source->ts_end;
     *frame_flags = cpi->source->flags;
 
 #if CONFIG_MULTIPLE_ARF
-    if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2))
-      cpi->rc.source_alt_ref_pending = is_next_frame_arf(cpi);
+    if (cm->frame_type != KEY_FRAME && cpi->pass == 2)
+      rc->source_alt_ref_pending = is_next_frame_arf(cpi);
 #endif
   } else {
     *size = 0;
@@ -3518,7 +3065,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
   if (cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    vp9_update_layer_framerate(cpi);
+    vp9_update_temporal_layer_framerate(cpi);
     vp9_restore_layer_context(cpi);
   }
 
@@ -3568,8 +3115,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
-  xd->interp_kernel = vp9_get_interp_kernel(
-      DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER);
 
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     vp9_vaq_init();
@@ -3578,7 +3123,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   if (cpi->pass == 1 &&
       (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
     Pass1Encode(cpi, size, dest, frame_flags);
-  } else if (cpi->pass == 2 && !cpi->use_svc) {
+  } else if (cpi->pass == 2 &&
+      (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
     Pass2Encode(cpi, size, dest, frame_flags);
   } else if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
@@ -3600,8 +3146,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   // Save layer specific state.
-  if (cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ||
+      (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) {
     vp9_save_layer_context(cpi);
   }
 
@@ -3844,28 +3391,12 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
   return;
 }
 
-int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source,
-                    const YV12_BUFFER_CONFIG *reference) {
-  int i, j;
-  int total = 0;
-
-  const uint8_t *src = source->y_buffer;
-  const uint8_t *ref = reference->y_buffer;
-
-  // Loop through the Y plane raw and reconstruction data summing
-  // (square differences)
-  for (i = 0; i < source->y_height; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      total += vp9_mse16x16(src + j, source->y_stride,
-                            ref + j, reference->y_stride, &sse);
-    }
-
-    src += 16 * source->y_stride;
-    ref += 16 * reference->y_stride;
-  }
+int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
 
-  return total;
+  return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                      a->y_crop_width, a->y_crop_height);
 }
 
 
diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h
index f4b44ce..e30fb02 100644
--- a/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_lookahead.h"
@@ -30,6 +31,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_variance.h"
@@ -114,75 +116,6 @@ typedef enum {
 } THR_MODES_SUB8X8;
 
 typedef enum {
-  DIAMOND = 0,
-  NSTEP = 1,
-  HEX = 2,
-  BIGDIA = 3,
-  SQUARE = 4,
-  FAST_HEX = 5,
-  FAST_DIAMOND = 6
-} SEARCH_METHODS;
-
-typedef enum {
-  USE_FULL_RD = 0,
-  USE_LARGESTINTRA,
-  USE_LARGESTINTRA_MODELINTER,
-  USE_LARGESTALL
-} TX_SIZE_SEARCH_METHOD;
-
-typedef enum {
-  NOT_IN_USE = 0,
-  RELAXED_NEIGHBORING_MIN_MAX = 1,
-  STRICT_NEIGHBORING_MIN_MAX = 2
-} AUTO_MIN_MAX_MODE;
-
-typedef enum {
-  // Terminate search early based on distortion so far compared to
-  // qp step, distortion in the neighborhood of the frame, etc.
-  FLAG_EARLY_TERMINATE = 1 << 0,
-
-  // Skips comp inter modes if the best so far is an intra mode.
-  FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
-
-  // Skips comp inter modes if the best single intermode so far does
-  // not have the same reference as one of the two references being
-  // tested.
-  FLAG_SKIP_COMP_REFMISMATCH = 1 << 2,
-
-  // Skips oblique intra modes if the best so far is an inter mode.
-  FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
-
-  // Skips oblique intra modes  at angles 27, 63, 117, 153 if the best
-  // intra so far is not one of the neighboring directions.
-  FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
-
-  // Skips intra modes other than DC_PRED if the source variance is small
-  FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
-} MODE_SEARCH_SKIP_LOGIC;
-
-typedef enum {
-  SUBPEL_TREE = 0,
-  // Other methods to come
-} SUBPEL_SEARCH_METHODS;
-
-typedef enum {
-  LAST_FRAME_PARTITION_OFF = 0,
-  LAST_FRAME_PARTITION_LOW_MOTION = 1,
-  LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef enum {
-  // No recode.
-  DISALLOW_RECODE = 0,
-  // Allow recode for KF and exceeding maximum frame bandwidth.
-  ALLOW_RECODE_KFMAXBW = 1,
-  // Allow recode only for KF/ARF/GF frames.
-  ALLOW_RECODE_KFARFGF = 2,
-  // Allow recode for all frames based on bitrate constraints.
-  ALLOW_RECODE = 3,
-} RECODE_LOOP_TYPE;
-
-typedef enum {
   // encode_breakout is disabled.
   ENCODE_BREAKOUT_DISABLED = 0,
   // encode_breakout is enabled.
@@ -192,225 +125,6 @@ typedef enum {
 } ENCODE_BREAKOUT_TYPE;
 
 typedef enum {
-  // Search partitions using RD/NONRD criterion
-  SEARCH_PARTITION = 0,
-
-  // Always use a fixed size partition
-  FIXED_PARTITION = 1,
-
-  // Use a fixed size partition in every 64X64 SB, where the size is
-  // determined based on source variance
-  VAR_BASED_FIXED_PARTITION = 2,
-
-  REFERENCE_PARTITION = 3,
-
-  // Use an arbitrary partitioning scheme based on source variance within
-  // a 64X64 SB
-  VAR_BASED_PARTITION
-} PARTITION_SEARCH_TYPE;
-
-typedef struct {
-  // Frame level coding parameter update
-  int frame_parameter_update;
-
-  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
-  SEARCH_METHODS search_method;
-
-  RECODE_LOOP_TYPE recode_loop;
-
-  // Subpel_search_method can only be subpel_tree which does a subpixel
-  // logarithmic search that keeps stepping at 1/2 pixel units until
-  // you stop getting a gain, and then goes on to 1/4 and repeats
-  // the same process. Along the way it skips many diagonals.
-  SUBPEL_SEARCH_METHODS subpel_search_method;
-
-  // Maximum number of steps in logarithmic subpel search before giving up.
-  int subpel_iters_per_step;
-
-  // Control when to stop subpel search
-  int subpel_force_stop;
-
-  // Thresh_mult is used to set a threshold for the rd score. A higher value
-  // means that we will accept the best mode so far more often. This number
-  // is used in combination with the current block size, and thresh_freq_fact
-  // to pick a threshold.
-  int thresh_mult[MAX_MODES];
-  int thresh_mult_sub8x8[MAX_REFS];
-
-  // This parameter controls the number of steps we'll do in a diamond
-  // search.
-  int max_step_search_steps;
-
-  // This parameter controls which step in the n-step process we start at.
-  // It's changed adaptively based on circumstances.
-  int reduce_first_step_size;
-
-  // If this is set to 1, we limit the motion search range to 2 times the
-  // largest motion vector found in the last frame.
-  int auto_mv_step_size;
-
-  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
-  int optimize_coefficients;
-
-  // Always set to 0. If on it enables 0 cost background transmission
-  // (except for the initial transmission of the segmentation). The feature is
-  // disabled because the addition of very large block sizes make the
-  // backgrounds very to cheap to encode, and the segmentation we have
-  // adds overhead.
-  int static_segmentation;
-
-  // If 1 we iterate finding a best reference for 2 ref frames together - via
-  // a log search that iterates 4 times (check around mv for last for best
-  // error of combined predictor then check around mv for alt). If 0 we
-  // we just use the best motion vector found for each frame by itself.
-  int comp_inter_joint_search_thresh;
-
-  // This variable is used to cap the maximum number of times we skip testing a
-  // mode to be evaluated. A high value means we will be faster.
-  int adaptive_rd_thresh;
-
-  // Enables skipping the reconstruction step (idct, recon) in the
-  // intermediate steps assuming the last frame didn't have too many intra
-  // blocks and the q is less than a threshold.
-  int skip_encode_sb;
-  int skip_encode_frame;
-
-  // This variable allows us to reuse the last frames partition choices
-  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
-  // frame as a starting point in low motion scenes or always use it. If set
-  // we use last partitioning_redo frequency to determine how often to redo
-  // the partitioning from scratch. Adjust_partitioning_from_last_frame
-  // enables us to adjust up or down one partitioning from the last frames
-  // partitioning.
-  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
-
-  // Determine which method we use to determine transform size. We can choose
-  // between options like full rd, largest for prediction size, largest
-  // for intra and model coefs for the rest.
-  TX_SIZE_SEARCH_METHOD tx_size_search_method;
-
-  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
-  // precise but significantly faster than the non lp version.
-  int use_lp32x32fdct;
-
-  // TODO(JBB): remove this as its no longer used.
-
-  // After looking at the first set of modes (set by index here), skip
-  // checking modes for reference frames that don't match the reference frame
-  // of the best so far.
-  int mode_skip_start;
-
-  // TODO(JBB): Remove this.
-  int reference_masking;
-
-  PARTITION_SEARCH_TYPE partition_search_type;
-
-  // Used if partition_search_type = FIXED_SIZE_PARTITION
-  BLOCK_SIZE always_this_block_size;
-
-  // Skip rectangular partition test when partition type none gives better
-  // rd than partition type split.
-  int less_rectangular_check;
-
-  // Disable testing non square partitions. (eg 16x32)
-  int use_square_partition_only;
-
-  // Sets min and max partition sizes for this 64x64 region based on the
-  // same 64x64 in last encoded frame, and the left and above neighbor.
-  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
-
-  // Min and max partition size we enable (block_size) as per auto
-  // min max, but also used by adjust partitioning, and pick_partitioning.
-  BLOCK_SIZE min_partition_size;
-  BLOCK_SIZE max_partition_size;
-
-  // Whether or not we allow partitions one smaller or one greater than the last
-  // frame's partitioning. Only used if use_lastframe_partitioning is set.
-  int adjust_partitioning_from_last_frame;
-
-  // How frequently we re do the partitioning from scratch. Only used if
-  // use_lastframe_partitioning is set.
-  int last_partitioning_redo_frequency;
-
-  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
-  // it always, to allow it for only Last frame and Intra, disable it for all
-  // inter modes or to enable it always.
-  int disable_split_mask;
-
-  // TODO(jingning): combine the related motion search speed features
-  // This allows us to use motion search at other sizes as a starting
-  // point for this motion search and limits the search range around it.
-  int adaptive_motion_search;
-
-  // Allows sub 8x8 modes to use the prediction filter that was determined
-  // best for 8x8 mode. If set to 0 we always re check all the filters for
-  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
-  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
-  int adaptive_pred_interp_filter;
-
-  // Search through variable block partition types in non-RD mode decision
-  // encoding process for RTC.
-  int partition_check;
-
-  // Implements various heuristics to skip searching modes
-  // The heuristics selected are based on  flags
-  // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
-  unsigned int mode_search_skip_flags;
-
-  // A source variance threshold below which the split mode is disabled
-  unsigned int disable_split_var_thresh;
-
-  // A source variance threshold below which filter search is disabled
-  // Choose a very large value (UINT_MAX) to use 8-tap always
-  unsigned int disable_filter_search_var_thresh;
-
-  // These bit masks allow you to enable or disable intra modes for each
-  // transform size separately.
-  int intra_y_mode_mask[TX_SIZES];
-  int intra_uv_mode_mask[TX_SIZES];
-
-  // This variable enables an early break out of mode testing if the model for
-  // rd built from the prediction signal indicates a value that's much
-  // higher than the best rd we've seen so far.
-  int use_rd_breakout;
-
-  // This enables us to use an estimate for intra rd based on dc mode rather
-  // than choosing an actual uv mode in the stage of encoding before the actual
-  // final encode.
-  int use_uv_intra_rd_estimate;
-
-  // This feature controls how the loop filter level is determined:
-  // 0: Try the full image with different values.
-  // 1: Try a small portion of the image with different values.
-  // 2: Estimate the level based on quantizer and frame type
-  int use_fast_lpf_pick;
-
-  // This feature limits the number of coefficients updates we actually do
-  // by only looking at counts from 1/2 the bands.
-  int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
-
-  // This flag controls the use of non-RD mode decision.
-  int use_nonrd_pick_mode;
-
-  // This variable sets the encode_breakout threshold. Currently, it is only
-  // enabled in real time mode.
-  int encode_breakout_thresh;
-
-  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
-  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
-  int disable_inter_mode_mask[BLOCK_SIZES];
-
-  // This feature controls whether we do the expensive context update and
-  // calculation in the rd coefficient costing loop.
-  int use_fast_coef_costing;
-
-  // This variable controls the maximum block size where intra blocks can be
-  // used in inter frames.
-  // TODO(aconverse): Fold this into one of the other many mode skips
-  BLOCK_SIZE max_intra_bsize;
-} SPEED_FEATURES;
-
-typedef enum {
   NORMAL      = 0,
   FOURFIVE    = 1,
   THREEFIVE   = 2,
@@ -418,44 +132,12 @@ typedef enum {
 } VPX_SCALING;
 
 typedef enum {
-  VP9_LAST_FLAG = 1 << 0,
-  VP9_GOLD_FLAG = 1 << 1,
-  VP9_ALT_FLAG = 1 << 2,
-} VP9_REFFRAME;
-
-typedef enum {
   USAGE_LOCAL_FILE_PLAYBACK = 0,
   USAGE_STREAM_FROM_SERVER  = 1,
   USAGE_CONSTRAINED_QUALITY = 2,
   USAGE_CONSTANT_QUALITY    = 3,
 } END_USAGE;
 
-typedef struct {
-  // Target percentage of blocks per frame that are cyclicly refreshed.
-  int max_mbs_perframe;
-  // Maximum q-delta as percentage of base q.
-  int max_qdelta_perc;
-  // Block size below which we don't apply cyclic refresh.
-  BLOCK_SIZE min_block_size;
-  // Macroblock starting index (unit of 8x8) for cycling through the frame.
-  int mb_index;
-  // Controls how long a block will need to wait to be refreshed again.
-  int time_for_refresh;
-  // Actual number of blocks that were applied delta-q (segment 1).
-  int num_seg_blocks;
-  // Actual encoding bits for segment 1.
-  int actual_seg_bits;
-  // RD mult. parameters for segment 1.
-  int rdmult;
-  // Cyclic refresh map.
-  signed char *map;
-  // Projected rate and distortion for the current superblock.
-  int64_t projected_rate_sb;
-  int64_t projected_dist_sb;
-  // Thresholds applied to projected rate/distortion of the superblock.
-  int64_t thresh_rate_sb;
-  int64_t thresh_dist_sb;
-} CYCLIC_REFRESH;
 typedef enum {
   // Good Quality Fast Encoding. The encoder balances quality with the
   // amount of time it takes to encode the output. (speed setting
@@ -503,10 +185,9 @@ typedef enum {
   AQ_MODE_COUNT  // This should always be the last member of the enum
 } AQ_MODE;
 
-typedef struct {
-  int version;  // 4 versions of bitstream defined:
-                //   0 - best quality/slowest decode,
-                //   3 - lowest quality/fastest decode
+typedef struct VP9_CONFIG {
+  BITSTREAM_PROFILE profile;
+  BIT_DEPTH bit_depth;
   int width;  // width of data passed to the compressor
   int height;  // height of data passed to the compressor
   double framerate;  // set to passed in framerate
@@ -550,6 +231,9 @@ typedef struct {
   int lossless;
   AQ_MODE aq_mode;  // Adaptive Quantization mode
 
+  // Enable feature to reduce the frame quantization every x frames.
+  int frame_periodic_boost;
+
   // two pass datarate control
   int two_pass_vbrbias;        // two pass datarate control tweaks
   int two_pass_vbrmin_section;
@@ -598,23 +282,7 @@ typedef struct {
 } VP9_CONFIG;
 
 typedef struct VP9_COMP {
-  DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
-
-  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
-
-#if CONFIG_ALPHA
-  DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]);
-#endif
-
+  QUANTS quants;
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9_CONFIG oxcf;
@@ -625,10 +293,14 @@ typedef struct VP9_COMP {
 #else
   struct lookahead_entry  *alt_ref_source;
 #endif
+  struct lookahead_entry  *last_source;
 
   YV12_BUFFER_CONFIG *Source;
+  YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
   YV12_BUFFER_CONFIG *un_scaled_source;
   YV12_BUFFER_CONFIG scaled_source;
+  YV12_BUFFER_CONFIG *unscaled_last_source;
+  YV12_BUFFER_CONFIG scaled_last_source;
 
   int key_frame_frequency;
 
@@ -671,6 +343,13 @@ typedef struct VP9_COMP {
   // Ambient reconstruction err target for force key frames
   int ambient_err;
 
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int rd_thresh_mult[MAX_MODES];
+  int rd_thresh_mult_sub8x8[MAX_REFS];
+
   int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
   int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
   int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
@@ -708,14 +387,12 @@ typedef struct VP9_COMP {
 
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-  vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
 
   struct vpx_codec_pkt_list  *output_pkt_list;
 
   MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
   int mbgraph_n_frames;             // number of frames filled in the above
   int static_mb_pct;                // % forced skip mbs by segmentation
-  int seg0_progress, seg0_idx, seg0_cnt;
 
   // for real time encoding
   int speed;
@@ -747,7 +424,7 @@ typedef struct VP9_COMP {
   unsigned char *active_map;
   unsigned int active_map_enabled;
 
-  CYCLIC_REFRESH cyclic_refresh;
+  CYCLIC_REFRESH *cyclic_refresh;
 
   fractional_mv_step_fp *find_fractional_mv_step;
   fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
@@ -805,10 +482,6 @@ typedef struct VP9_COMP {
   unsigned int activity_avg;
   unsigned int *mb_activity_map;
   int *mb_norm_activity_map;
-  int output_partition;
-
-  // Force next frame to intra when kf_auto says so.
-  int force_next_frame_intra;
 
   int droppable;
 
@@ -823,6 +496,8 @@ typedef struct VP9_COMP {
 
   SVC svc;
 
+  int use_large_partition_rate;
+
 #if CONFIG_MULTIPLE_ARF
   // ARF tracking variables.
   int multi_arf_enabled;
@@ -840,13 +515,6 @@ typedef struct VP9_COMP {
   // Debug / test stats
   int64_t mode_test_hits[BLOCK_SIZES];
 #endif
-
-  // Y,U,V,(A)
-  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
-  PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[8];
 } VP9_COMP;
 
 void vp9_initialize_enc();
@@ -854,7 +522,7 @@ void vp9_initialize_enc();
 struct VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf);
 void vp9_remove_compressor(VP9_COMP *cpi);
 
-void vp9_change_config(VP9_COMP *cpi, VP9_CONFIG *oxcf);
+void vp9_change_config(VP9_COMP *cpi, const VP9_CONFIG *oxcf);
 
   // receive a frames worth of data. caller can assume that a copy of this
   // frame is made and not just a copy of the pointer..
@@ -903,8 +571,8 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc);
 
 int vp9_get_quantizer(struct VP9_COMP *cpi);
 
-static int get_ref_frame_idx(const VP9_COMP *cpi,
-                             MV_REFERENCE_FRAME ref_frame) {
+static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
+                                    MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
     return cpi->lst_fb_idx;
   } else if (ref_frame == GOLDEN_FRAME) {
@@ -914,39 +582,45 @@ static int get_ref_frame_idx(const VP9_COMP *cpi,
   }
 }
 
-static YV12_BUFFER_CONFIG *get_ref_frame_buffer(VP9_COMP *cpi,
-                                                MV_REFERENCE_FRAME ref_frame) {
-  VP9_COMMON *const cm = &cpi->common;
-  return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi,
-                                                             ref_frame)]].buf;
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  VP9_COMMON * const cm = &cpi->common;
+  return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
+      .buf;
 }
 
-void vp9_set_speed_features(VP9_COMP *cpi);
-
-int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source,
-                    const YV12_BUFFER_CONFIG *reference);
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality
+static INLINE int vp9_frame_is_boosted(const VP9_COMP *cpi) {
+  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
 
-void vp9_alloc_compressor_data(VP9_COMP *cpi);
+static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+  // TODO(JBB): make this work for alpha channel and double check we can't
+  // exceed this token count if we have a 32x32 transform crossing a boundary
+  // at a multiple of 16.
+  // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full
+  // resolution. We assume up to 1 token per pixel, and then allow
+  // a head room of 4.
+  return mb_rows * mb_cols * (16 * 16 * 3 + 4);
+}
 
-int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget);
+int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 
-int vp9_compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index,
-                               double rate_target_ratio);
+void vp9_alloc_compressor_data(VP9_COMP *cpi);
 
 void vp9_scale_references(VP9_COMP *cpi);
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
 
-static int get_token_alloc(int mb_rows, int mb_cols) {
-  return mb_rows * mb_cols * (48 * 16 + 4);
-}
-
 extern const int q_trans[];
 
 int64_t vp9_rescale(int64_t val, int64_t num, int denom);
 
-static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
-                         MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) {
+static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
   xd->block_refs[0] = &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME
                                                          : 0];
   xd->block_refs[1] = &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index b5f4901..3ac8522 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -10,16 +10,18 @@
 
 #include <assert.h>
 #include <limits.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
+
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "./vpx_scale_rtcd.h"
 
 static int get_max_filter_level(VP9_COMP *cpi) {
   return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
@@ -28,12 +30,12 @@ static int get_max_filter_level(VP9_COMP *cpi) {
 
 
 static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
-                            MACROBLOCKD *const xd, VP9_COMMON *const cm,
                             int filt_level, int partial_frame) {
+  VP9_COMMON *const cm = &cpi->common;
   int filt_err;
 
-  vp9_loop_filter_frame(cm, xd, filt_level, 1, partial_frame);
-  filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_level, 1, partial_frame);
+  filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
 
   // Re-instate the unfiltered frame
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -43,7 +45,6 @@ static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
 
 static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
                                 int partial_frame) {
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   VP9_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
   const int min_filter_level = 0;
@@ -64,7 +65,7 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
   //  Make a copy of the unfiltered / processed recon buffer
   vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
 
-  best_err = try_filter_frame(sd, cpi, xd, cm, filt_mid, partial_frame);
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
 
@@ -86,7 +87,7 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       if (ss_err[filt_low] < 0) {
-        filt_err = try_filter_frame(sd, cpi, xd, cm, filt_low, partial_frame);
+        filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
         ss_err[filt_low] = filt_err;
       } else {
         filt_err = ss_err[filt_low];
@@ -105,7 +106,7 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
       if (ss_err[filt_high] < 0) {
-        filt_err = try_filter_frame(sd, cpi, xd, cm, filt_high, partial_frame);
+        filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
         ss_err[filt_high] = filt_err;
       } else {
         filt_err = ss_err[filt_high];
@@ -119,7 +120,7 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
 
     // Half the step distance if the best filter value was the same as last time
     if (filt_best == filt_mid) {
-      filter_step = filter_step / 2;
+      filter_step /= 2;
       filt_direction = 0;
     } else {
       filt_direction = (filt_best < filt_mid) ? -1 : 1;
@@ -131,25 +132,24 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
 }
 
 void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
-                           int method) {
+                           LPF_PICK_METHOD method) {
   VP9_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
 
   lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
                                                     : cpi->oxcf.sharpness;
 
-  if (method == 2) {
+  if (method == LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = get_max_filter_level(cpi);
     const int q = vp9_ac_quant(cm->base_qindex, 0);
     // These values were determined by linear fitting the result of the
-    // searched level
-    // filt_guess = q * 0.316206 + 3.87252
-    int filt_guess = (q * 20723 + 1015158 + (1 << 17)) >> 18;
+    // searched level, filt_guess = q * 0.316206 + 3.87252
+    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
   } else {
-    search_filter_level(sd, cpi, method == 1);
+    search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE);
   }
 }
diff --git a/libvpx/vp9/encoder/vp9_picklpf.h b/libvpx/vp9/encoder/vp9_picklpf.h
index 203ef87..7d08ddb 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.h
+++ b/libvpx/vp9/encoder/vp9_picklpf.h
@@ -16,11 +16,13 @@
 extern "C" {
 #endif
 
+#include "vp9/encoder/vp9_onyx_int.h"
+
 struct yv12_buffer_config;
 struct VP9_COMP;
 
 void vp9_pick_filter_level(const struct yv12_buffer_config *sd,
-                           struct VP9_COMP *cpi, int method);
+                           struct VP9_COMP *cpi, LPF_PICK_METHOD method);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c
index 6c84144..f3fe99c 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/libvpx/vp9/encoder/vp9_pickmode.c
@@ -29,9 +29,9 @@
 static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     const TileInfo *const tile,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    int_mv *tmp_mv) {
+                                    int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int step_param;
   int sadpb = x->sadperbit16;
@@ -76,8 +76,11 @@ static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
       return;
     }
   }
-
-  mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+  assert(x->mv_best_ref_index[ref] <= 2);
+  if (x->mv_best_ref_index[ref] < 2)
+    mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+  else
+    mvp_full = x->pred_mv[ref].as_mv;
 
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
@@ -125,14 +128,20 @@ static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     for (i = 0; i < MAX_MB_PLANE; i++)
       xd->plane[i].pre[0] = backup_yv12[i];
   }
+
+  // calculate the bit cost on motion vector
+  mvp_full.row = tmp_mv->as_mv.row * 8;
+  mvp_full.col = tmp_mv->as_mv.col * 8;
+  *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 }
 
 static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     const TileInfo *const tile,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    MV *tmp_mv, int *rate_mv) {
+                                    MV *tmp_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int ref = mbmi->ref_frame[0];
   MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
@@ -160,15 +169,13 @@ static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                x->nmvjointcost, x->mvcost,
                                &dis, &x->pred_sse[ref]);
 
-  // calculate the bit cost on motion vector
-  *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv,
-                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++)
       xd->plane[i].pre[0] = backup_yv12[i];
   }
+
+  x->pred_mv[ref].as_mv = *tmp_mv;
 }
 
 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
@@ -183,14 +190,12 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 
-  int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                               pd->dst.buf, pd->dst.stride, &sse);
+  int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
+                                  pd->dst.buf, pd->dst.stride, &sse);
 
-  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bs],
+  vp9_model_rd_from_var_lapndz(sse + var, 1 << num_pels_log2_lookup[bsize],
                                pd->dequant[1] >> 3, &rate, &dist);
-
   *out_rate_sum = rate;
   *out_dist_sum = dist << 3;
 }
@@ -204,12 +209,12 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int64_t *returndistortion,
                             BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode, best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
+  INTERP_FILTER best_pred_filter = EIGHTTAP;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
@@ -227,6 +232,13 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                            intra_cost_penalty, 0);
   const int64_t intra_mode_cost = 50;
 
+  unsigned char segment_id = mbmi->segment_id;
+  const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize];
+  // Mode index conversion from MB_PREDICTION_MODE to THR_MODES for a ref frame.
+  int mode_idx[MB_MODE_COUNT] = {0};
+  INTERP_FILTER filter_ref = SWITCHABLE;
+
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
   x->skip = 0;
@@ -244,20 +256,24 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ?
                         EIGHTTAP : cpi->common.interp_filter;
   mbmi->skip = 0;
-  mbmi->segment_id = 0;
-  xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+  mbmi->segment_id = segment_id;
 
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       vp9_setup_buffer_inter(cpi, x, tile,
-                             ref_frame, block_size, mi_row, mi_col,
+                             ref_frame, bsize, mi_row, mi_col,
                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1]->mbmi.interp_filter;
+
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
@@ -270,6 +286,14 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     mbmi->ref_frame[0] = ref_frame;
 
+    // Set conversion index for LAST_FRAME.
+    if (ref_frame == LAST_FRAME) {
+      mode_idx[NEARESTMV] = THR_NEARESTMV;   // LAST_FRAME, NEARESTMV
+      mode_idx[NEARMV] = THR_NEARMV;         // LAST_FRAME, NEARMV
+      mode_idx[ZEROMV] = THR_ZEROMV;         // LAST_FRAME, ZEROMV
+      mode_idx[NEWMV] = THR_NEWMV;           // LAST_FRAME, NEWMV
+    }
+
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
@@ -277,18 +301,29 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           (1 << INTER_OFFSET(this_mode)))
         continue;
 
+      if (best_rd < ((int64_t)rd_threshes[mode_idx[this_mode]] *
+          rd_thresh_freq_fact[this_mode] >> 5) ||
+          rd_threshes[mode_idx[this_mode]] == INT_MAX)
+        continue;
+
       if (this_mode == NEWMV) {
+        int rate_mode = 0;
         if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
 
         full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
-                                 &frame_mv[NEWMV][ref_frame]);
+                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
 
         if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
           continue;
 
+        rate_mode = x->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                      [INTER_OFFSET(this_mode)];
+        if (RDCOST(x->rdmult, x->rddiv, rate_mv + rate_mode, 0) > best_rd)
+          continue;
+
         sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
-                                &frame_mv[NEWMV][ref_frame].as_mv, &rate_mv);
+                                &frame_mv[NEWMV][ref_frame].as_mv);
       }
 
       if (this_mode != NEARESTMV)
@@ -298,9 +333,63 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
       mbmi->mode = this_mode;
       mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
-      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
 
-      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
+      // Search for the best prediction filter type when the resulting
+      // motion vector is at sub-pixel accuracy for the luma component, i.e.,
+      // the last three bits of its row or column are not all zeros.
+      if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
+          ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
+           (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
+        int64_t tmp_rdcost1 = INT64_MAX;
+        int64_t tmp_rdcost2 = INT64_MAX;
+        int64_t tmp_rdcost3 = INT64_MAX;
+        int pf_rate[3];
+        int64_t pf_dist[3];
+
+        mbmi->interp_filter = EIGHTTAP;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP],
+                          &pf_dist[EIGHTTAP]);
+        tmp_rdcost1 = RDCOST(x->rdmult, x->rddiv,
+                             vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP],
+                             pf_dist[EIGHTTAP]);
+
+        mbmi->interp_filter = EIGHTTAP_SHARP;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SHARP],
+                          &pf_dist[EIGHTTAP_SHARP]);
+        tmp_rdcost2 = RDCOST(x->rdmult, x->rddiv,
+                          vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP_SHARP],
+                          pf_dist[EIGHTTAP_SHARP]);
+
+        mbmi->interp_filter = EIGHTTAP_SMOOTH;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SMOOTH],
+                          &pf_dist[EIGHTTAP_SMOOTH]);
+        tmp_rdcost3 = RDCOST(x->rdmult, x->rddiv,
+                          vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP_SMOOTH],
+                          pf_dist[EIGHTTAP_SMOOTH]);
+
+        if (tmp_rdcost2 < tmp_rdcost1) {
+          if (tmp_rdcost2 < tmp_rdcost3)
+            mbmi->interp_filter = EIGHTTAP_SHARP;
+          else
+            mbmi->interp_filter = EIGHTTAP_SMOOTH;
+        } else {
+          if (tmp_rdcost1 < tmp_rdcost3)
+            mbmi->interp_filter = EIGHTTAP;
+          else
+            mbmi->interp_filter = EIGHTTAP_SMOOTH;
+        }
+
+        rate = pf_rate[mbmi->interp_filter];
+        dist = pf_dist[mbmi->interp_filter];
+      } else {
+        mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
+      }
+
       rate += rate_mv;
       rate += x->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                 [INTER_OFFSET(this_mode)];
@@ -311,15 +400,17 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         *returnrate = rate;
         *returndistortion = dist;
         best_mode = this_mode;
+        best_pred_filter = mbmi->interp_filter;
         best_ref_frame = ref_frame;
       }
     }
   }
 
   mbmi->mode = best_mode;
+  mbmi->interp_filter = best_pred_filter;
   mbmi->ref_frame[0] = best_ref_frame;
   mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
-  xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+  xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
 
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index 4ab8995..c092ee4 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -153,6 +153,7 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
 
 void vp9_init_quantizer(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  QUANTS *const quants = &cpi->quants;
   int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
@@ -163,48 +164,49 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       // y
       quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
                      : vp9_ac_quant(q, 0);
-      invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant);
-      cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      cpi->y_round[q][i] = (qrounding_factor * quant) >> 7;
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->y_dequant[q][i] = quant;
 
       // uv
       quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
                      : vp9_ac_quant(q, cm->uv_ac_delta_q);
-      invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant);
-      cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+      invert_quant(&quants->uv_quant[q][i],
+                   &quants->uv_quant_shift[q][i], quant);
+      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->uv_dequant[q][i] = quant;
 
 #if CONFIG_ALPHA
       // alpha
       quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
                      : vp9_ac_quant(q, cm->a_ac_delta_q);
-      invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant);
-      cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      cpi->a_round[q][i] = (qrounding_factor * quant) >> 7;
+      invert_quant(&quants->a_quant[q][i], &quants->a_quant_shift[q][i], quant);
+      quants->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->a_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->a_dequant[q][i] = quant;
 #endif
     }
 
     for (i = 2; i < 8; i++) {
-      cpi->y_quant[q][i] = cpi->y_quant[q][1];
-      cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1];
-      cpi->y_zbin[q][i] = cpi->y_zbin[q][1];
-      cpi->y_round[q][i] = cpi->y_round[q][1];
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
       cm->y_dequant[q][i] = cm->y_dequant[q][1];
 
-      cpi->uv_quant[q][i] = cpi->uv_quant[q][1];
-      cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1];
-      cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1];
-      cpi->uv_round[q][i] = cpi->uv_round[q][1];
+      quants->uv_quant[q][i] = quants->uv_quant[q][1];
+      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+      quants->uv_round[q][i] = quants->uv_round[q][1];
       cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
 
 #if CONFIG_ALPHA
-      cpi->a_quant[q][i] = cpi->a_quant[q][1];
-      cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1];
-      cpi->a_zbin[q][i] = cpi->a_zbin[q][1];
-      cpi->a_round[q][i] = cpi->a_round[q][1];
+      quants->a_quant[q][i] = quants->a_quant[q][1];
+      quants->a_quant_shift[q][i] = quants->a_quant_shift[q][1];
+      quants->a_zbin[q][i] = quants->a_zbin[q][1];
+      quants->a_round[q][i] = quants->a_round[q][1];
       cm->a_dequant[q][i] = cm->a_dequant[q][1];
 #endif
     }
@@ -213,27 +215,28 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
 
 void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   const VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  QUANTS *const quants = &cpi->quants;
+  const int segment_id = xd->mi[0]->mbmi.segment_id;
   const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
   const int zbin = cpi->zbin_mode_boost + x->act_zbin_adj;
   int i;
 
   // Y
-  x->plane[0].quant = cpi->y_quant[qindex];
-  x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
-  x->plane[0].zbin = cpi->y_zbin[qindex];
-  x->plane[0].round = cpi->y_round[qindex];
+  x->plane[0].quant = quants->y_quant[qindex];
+  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+  x->plane[0].zbin = quants->y_zbin[qindex];
+  x->plane[0].round = quants->y_round[qindex];
   x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
   xd->plane[0].dequant = cm->y_dequant[qindex];
 
   // UV
   for (i = 1; i < 3; i++) {
-    x->plane[i].quant = cpi->uv_quant[qindex];
-    x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
-    x->plane[i].zbin = cpi->uv_zbin[qindex];
-    x->plane[i].round = cpi->uv_round[qindex];
+    x->plane[i].quant = quants->uv_quant[qindex];
+    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
+    x->plane[i].zbin = quants->uv_zbin[qindex];
+    x->plane[i].round = quants->uv_round[qindex];
     x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
     xd->plane[i].dequant = cm->uv_dequant[qindex];
   }
@@ -273,9 +276,7 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) {
   vp9_init_plane_quantizers(cpi, &cpi->mb);
 }
 
-void vp9_set_quantizer(struct VP9_COMP *cpi, int q) {
-  VP9_COMMON *const cm = &cpi->common;
-
+void vp9_set_quantizer(VP9_COMMON *cm, int q) {
   // quantizer has to be reinitialized with vp9_init_quantizer() if any
   // delta_q changes.
   cm->base_qindex = q;
diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h
index f356b12..7d231df 100644
--- a/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libvpx/vp9/encoder/vp9_quantize.h
@@ -17,12 +17,30 @@
 extern "C" {
 #endif
 
+typedef struct {  // Quantizer tables, indexed [qindex][0..7].
+  DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+  // Same four tables for the uv planes.
+  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+  // Alpha-plane tables, compiled in only when CONFIG_ALPHA is set.
+#if CONFIG_ALPHA
+  DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]);
+#endif
+} QUANTS;  // [q][0] comes from the DC quantizer, [q][1..7] from the AC one.
+
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
 struct VP9_COMP;
-
-void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
+struct VP9Common;
 
 void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
 
@@ -32,6 +50,8 @@ void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
 
+void vp9_set_quantizer(struct VP9Common *cm, int q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index edc48bb..3420816 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -35,9 +35,6 @@
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
-// Bits Per MB at different Q (Multiplied by 512)
-#define BPER_MB_NORMBITS    9
-
 // Tables relating active max Q to active min Q
 static int kf_low_motion_minq[QINDEX_RANGE];
 static int kf_high_motion_minq[QINDEX_RANGE];
@@ -55,10 +52,9 @@ static int kf_low = 400;
 // formulaic approach to facilitate easier adjustment of the Q tables.
 // The formulae were derived from computing a 3rd order polynomial best
 // fit to the original data (after plotting real maxq vs minq (not q index))
-static int calculate_minq_index(double maxq,
-                                double x3, double x2, double x1, double c) {
+static int get_minq_index(double maxq, double x3, double x2, double x1) {
   int i;
-  const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
+  const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq,
                                 maxq);
 
   // Special case handling to deal with the step from q2.0
@@ -66,57 +62,26 @@ static int calculate_minq_index(double maxq,
   if (minqtarget <= 2.0)
     return 0;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
+  for (i = 0; i < QINDEX_RANGE; i++)
     if (minqtarget <= vp9_convert_qindex_to_q(i))
       return i;
-  }
 
   return QINDEX_RANGE - 1;
 }
 
-void vp9_rc_init_minq_luts(void) {
+void vp9_rc_init_minq_luts(void) {  // Fills the per-qindex minq tables.
   int i;
 
   for (i = 0; i < QINDEX_RANGE; i++) {
     const double maxq = vp9_convert_qindex_to_q(i);
 
-
-    kf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.000001,
-                                                 -0.0004,
-                                                 0.15,
-                                                 0.0);
-    kf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.000002,
-                                                  -0.0012,
-                                                  0.50,
-                                                  0.0);
-
-    gf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000015,
-                                                 -0.0009,
-                                                 0.32,
-                                                 0.0);
-    gf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000021,
-                                                  -0.00125,
-                                                  0.50,
-                                                  0.0);
-    afq_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000015,
-                                                  -0.0009,
-                                                  0.33,
-                                                  0.0);
-    afq_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                   0.0000021,
-                                                   -0.00125,
-                                                   0.55,
-                                                   0.0);
-    inter_minq[i] = calculate_minq_index(maxq,
-                                         0.00000271,
-                                         -0.00113,
-                                         0.75,
-                                         0.0);
+    kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.15);
+    kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50);
+    gf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.32);
+    gf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
+    afq_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.33);
+    afq_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55);
+    inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.75);
   }
 }
 
@@ -138,79 +103,10 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
   return (int)(0.5 + (enumerator * correction_factor / q));
 }
 
-void vp9_save_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Stores a snapshot of key state variables which can subsequently be
-  // restored with a call to vp9_restore_coding_context. These functions are
-  // intended for use in a re-code loop in vp9_compress_frame where the
-  // quantizer value is adjusted between loop iterations.
-  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
-
-  vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
-
-  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
-
-  vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
-  vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
-
-  cc->fc = cm->fc;
-}
-
-void vp9_restore_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Restore key state variables to the snapshot state stored in the
-  // previous call to vp9_save_coding_context.
-  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
-
-  vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
-
-  vpx_memcpy(cm->last_frame_seg_map,
-             cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mi_rows * cm->mi_cols));
-
-  vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
-  vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
-
-  cm->fc = cc->fc;
-}
-
-void vp9_setup_key_frame(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  vp9_setup_past_independence(cm);
-
-  /* All buffers are implicitly updated on key frames. */
-  cpi->refresh_golden_frame = 1;
-  cpi->refresh_alt_ref_frame = 1;
-}
-
-void vp9_setup_inter_frame(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  if (cm->error_resilient_mode || cm->intra_only)
-    vp9_setup_past_independence(cm);
-
-  assert(cm->frame_context_idx < FRAME_CONTEXTS);
-  cm->fc = cm->frame_contexts[cm->frame_context_idx];
-}
-
-static int estimate_bits_at_q(int frame_kind, int q, int mbs,
+static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
                               double correction_factor) {
-  const int bpm = (int)(vp9_rc_bits_per_mb(frame_kind, q, correction_factor));
-
-  // Attempt to retain reasonable accuracy without overflow. The cutoff is
-  // chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-  // largest Bpm takes 20 bits.
-  return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs
-                           : (bpm * mbs) >> BPER_MB_NORMBITS;
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor));
+  return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS;  // Multiply in 64 bits.
 }
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
@@ -247,13 +143,12 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
 
 
 // Update the buffer level for higher layers, given the encoded current layer.
-static void update_layer_buffer_level(VP9_COMP *const cpi,
-                                      int encoded_frame_size) {
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
   int temporal_layer = 0;
-  int current_temporal_layer = cpi->svc.temporal_layer_id;
+  int current_temporal_layer = svc->temporal_layer_id;
   for (temporal_layer = current_temporal_layer + 1;
-      temporal_layer < cpi->svc.number_temporal_layers; ++temporal_layer) {
-    LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
+      temporal_layer < svc->number_temporal_layers; ++temporal_layer) {
+    LAYER_CONTEXT *lc = &svc->layer_context[temporal_layer];
     RATE_CONTROL *lrc = &lc->rc;
     int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
         encoded_frame_size);
@@ -283,10 +178,60 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
   rc->buffer_level = rc->bits_off_target;
 
   if (cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    update_layer_buffer_level(cpi, encoded_frame_size);
+    update_layer_buffer_level(&cpi->svc, encoded_frame_size);
   }
 }
 
+void vp9_rc_init(const VP9_CONFIG *oxcf, int pass, RATE_CONTROL *rc) {
+  if (pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
+    rc->avg_frame_qindex[0] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[1] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[2] = oxcf->worst_allowed_q;
+  } else {  // Otherwise start at the midpoint of the allowed Q range.
+    rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+  }
+  // Seed all three last_q slots with the best allowed Q.
+  rc->last_q[0] = oxcf->best_allowed_q;
+  rc->last_q[1] = oxcf->best_allowed_q;
+  rc->last_q[2] = oxcf->best_allowed_q;
+  // The buffer starts at the configured starting level.
+  rc->buffer_level =    oxcf->starting_buffer_level;
+  rc->bits_off_target = oxcf->starting_buffer_level;
+  // NOTE(review): assumes caller already set rc->av_per_frame_bandwidth.
+  rc->rolling_target_bits      = rc->av_per_frame_bandwidth;
+  rc->rolling_actual_bits      = rc->av_per_frame_bandwidth;
+  rc->long_rolling_target_bits = rc->av_per_frame_bandwidth;
+  rc->long_rolling_actual_bits = rc->av_per_frame_bandwidth;
+
+  rc->total_actual_bits = 0;
+  rc->total_target_vs_actual = 0;
+
+  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+  rc->frames_since_key = 8;  // Sensible default for first frame.
+  rc->this_key_frame_forced = 0;
+  rc->next_key_frame_forced = 0;
+  rc->source_alt_ref_pending = 0;
+  rc->source_alt_ref_active = 0;
+
+  rc->frames_till_gf_update_due = 0;
+
+  rc->ni_av_qi = oxcf->worst_allowed_q;
+  rc->ni_tot_qi = 0;
+  rc->ni_frames = 0;
+
+  rc->tot_q = 0.0;
+  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q);
+  // All rate correction factors start neutral (1.0).
+  rc->rate_correction_factor = 1.0;
+  rc->key_frame_rate_correction_factor = 1.0;
+  rc->gf_rate_correction_factor = 1.0;
+}
+
 int vp9_rc_drop_frame(VP9_COMP *cpi) {
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -330,6 +275,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) {
     return cpi->rc.key_frame_rate_correction_factor;
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+        !cpi->rc.is_src_frame_alt_ref &&
         !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
       return cpi->rc.gf_rate_correction_factor;
     else
@@ -342,6 +288,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
     cpi->rc.key_frame_rate_correction_factor = factor;
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+        !cpi->rc.is_src_frame_alt_ref &&
         !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
       cpi->rc.gf_rate_correction_factor = factor;
     else
@@ -350,7 +297,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
 }
 
 void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
-  const int q = cpi->common.base_qindex;
+  const VP9_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
   double rate_correction_factor = get_rate_correction_factor(cpi);
   double adjustment_limit;
@@ -363,8 +310,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
-  projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q,
-                                                 cpi->common.MBs,
+  projected_size_based_on_q = estimate_bits_at_q(cm->frame_type,
+                                                 cm->base_qindex, cm->MBs,
                                                  rate_correction_factor);
   // Work out a size correction factor.
   if (projected_size_based_on_q > 0)
@@ -388,20 +335,18 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
 
   if (correction_factor > 102) {
     // We are not already at the worst allowable quality
-    correction_factor =
-        (int)(100 + ((correction_factor - 100) * adjustment_limit));
-    rate_correction_factor =
-        ((rate_correction_factor * correction_factor) / 100);
+    correction_factor = (int)(100 + ((correction_factor - 100) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
 
     // Keep rate_correction_factor within limits
     if (rate_correction_factor > MAX_BPB_FACTOR)
       rate_correction_factor = MAX_BPB_FACTOR;
   } else if (correction_factor < 99) {
     // We are not already at the best allowable quality
-    correction_factor =
-        (int)(100 - ((100 - correction_factor) * adjustment_limit));
-    rate_correction_factor =
-        ((rate_correction_factor * correction_factor) / 100);
+    correction_factor = (int)(100 - ((100 - correction_factor) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
 
     // Keep rate_correction_factor within limits
     if (rate_correction_factor < MIN_BPB_FACTOR)
@@ -422,11 +367,8 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
 
   // Calculate required scaling factor based on target frame size and size of
   // frame produced using previous Q.
-  if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
-    // Case where we would overflow int
-    target_bits_per_mb = (target_bits_per_frame / cm->MBs) << BPER_MB_NORMBITS;
-  else
-    target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
+    target_bits_per_mb =
+        ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
 
   i = active_best_quality;
 
@@ -493,6 +435,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   // If buffer is below the optimal level, let the active_worst_quality go from
   // ambient Q (at buffer = optimal level) to worst_quality level
   // (at buffer = critical level).
+  const VP9_COMMON *const cm = &cpi->common;
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   // Buffer level below which we push active_worst to worst_quality.
@@ -500,9 +443,9 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   int64_t buff_lvl_step = 0;
   int adjustment = 0;
   int active_worst_quality;
-  if (cpi->common.frame_type == KEY_FRAME)
+  if (cm->frame_type == KEY_FRAME)
     return rc->worst_quality;
-  if (cpi->common.current_video_frame > 1)
+  if (cm->current_video_frame > 1)
     active_worst_quality = MIN(rc->worst_quality,
                                rc->avg_frame_qindex[INTER_FRAME] * 5 / 4);
   else
@@ -556,7 +499,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-      int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
                                             (last_boosted_q * 0.75));
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else if (cm->current_video_frame > 0) {
@@ -578,8 +521,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
       q_val = vp9_convert_qindex_to_q(active_best_quality);
-      active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val *
-                                                   q_adj_factor);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              !cpi->use_svc &&
@@ -635,7 +578,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
                           active_best_quality, active_worst_quality);
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate
-      if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth)
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
         *top_index = q;
       else
         q = *top_index;
@@ -668,8 +611,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-      int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
-                                            (last_boosted_q * 0.75));
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            last_boosted_q * 0.75);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else if (cm->current_video_frame > 0) {
       // not first frame of one pass and kf_boost is set
@@ -690,15 +633,15 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
       q_val = vp9_convert_qindex_to_q(active_best_quality);
-      active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val *
-                                                   q_adj_factor);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor);
     }
 #else
     double current_q;
     // Force the KF quantizer to be 30% of the active_worst_quality.
     current_q = vp9_convert_qindex_to_q(active_worst_quality);
     active_best_quality = active_worst_quality
-        + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
+        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
 #endif
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -801,7 +744,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
                           active_best_quality, active_worst_quality);
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate
-      if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth)
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
         *top_index = q;
       else
         q = *top_index;
@@ -817,7 +760,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
     assert(level >= 0);
     new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
     q = active_worst_quality +
-        vp9_compute_qdelta(cpi, current_q, new_q);
+        vp9_compute_qdelta(rc, current_q, new_q);
 
     *bottom_index = q;
     *top_index    = q;
@@ -850,8 +793,8 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-      int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
-                                            (last_boosted_q * 0.75));
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            last_boosted_q * 0.75);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Not forced keyframe.
@@ -875,15 +818,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
       q_val = vp9_convert_qindex_to_q(active_best_quality);
-      active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val *
-                                                   q_adj_factor);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor);
     }
 #else
     double current_q;
     // Force the KF quantizer to be 30% of the active_worst_quality.
     current_q = vp9_convert_qindex_to_q(active_worst_quality);
     active_best_quality = active_worst_quality
-        + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
+        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
 #endif
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -984,7 +927,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
                           active_best_quality, active_worst_quality);
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate.
-      if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth)
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
         *top_index = q;
       else
         q = *top_index;
@@ -1000,7 +943,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
     assert(level >= 0);
     new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
     q = active_worst_quality +
-        vp9_compute_qdelta(cpi, current_q, new_q);
+        vp9_compute_qdelta(rc, current_q, new_q);
 
     *bottom_index = q;
     *top_index    = q;
@@ -1016,8 +959,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
 }
 
 int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi,
-                             int *bottom_index,
-                             int *top_index) {
+                             int *bottom_index, int *top_index) {
   int q;
   if (cpi->pass == 0) {
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -1028,14 +970,14 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi,
     q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
   }
 
-  // JBB : This is realtime mode.  In real time mode the first frame
-  // should be larger. Q of 0 is disabled because we force tx size to be
+  // Q of 0 is disabled because we force tx size to be
   // 16x16...
   if (cpi->sf.use_nonrd_pick_mode) {
-    if (cpi->common.current_video_frame == 0)
-      q /= 3;
     if (q == 0)
       q++;
+    if (cpi->sf.force_frame_boost == 1)
+      q -= cpi->sf.max_delta_qindex;
+
     if (q < *bottom_index)
       *bottom_index = q;
     else if (q > *top_index)
@@ -1053,28 +995,14 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
     *frame_under_shoot_limit = 0;
     *frame_over_shoot_limit  = INT_MAX;
   } else {
-    if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit  = this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = this_frame_target * 7 / 8;
-    } else {
-      if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
-        *frame_over_shoot_limit  = this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = this_frame_target * 7 / 8;
-      } else {
-        // Stron overshoot limit for constrained quality
-        if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-          *frame_over_shoot_limit  = this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = this_frame_target * 2 / 8;
-        } else {
-          *frame_over_shoot_limit  = this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = this_frame_target * 5 / 8;
-        }
-      }
-    }
+    int recode_tolerance =
+      (cpi->sf.recode_tolerance * this_frame_target) / 100;
+
+    *frame_over_shoot_limit = this_frame_target + recode_tolerance;
+    *frame_under_shoot_limit = this_frame_target - recode_tolerance;
 
     // For very small rate targets where the fractional adjustment
-    // (eg * 7/8) may be tiny make sure there is at least a minimum
-    // range.
+    // may be tiny make sure there is at least a minimum range.
     *frame_over_shoot_limit += 200;
     *frame_under_shoot_limit -= 200;
     if (*frame_under_shoot_limit < 0)
@@ -1099,16 +1027,17 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
 
 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
   // this frame refreshes means next frames don't unless specified by user
-  cpi->rc.frames_since_golden = 0;
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->frames_since_golden = 0;
 
 #if CONFIG_MULTIPLE_ARF
   if (!cpi->multi_arf_enabled)
 #endif
     // Clear the alternate reference update pending flag.
-    cpi->rc.source_alt_ref_pending = 0;
+    rc->source_alt_ref_pending = 0;
 
   // Set the alternate reference frame active flag
-  cpi->rc.source_alt_ref_active = 1;
+  rc->source_alt_ref_active = 1;
 }
 
 static void update_golden_frame_stats(VP9_COMP *cpi) {
@@ -1137,6 +1066,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
 
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
 
   cm->last_frame_type = cm->frame_type;
@@ -1146,7 +1076,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   // Post encode loop adjustment of Q prediction.
   vp9_rc_update_rate_correction_factors(
       cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF ||
-            cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+            oxcf->end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
 
   // Keep a record of last Q and ambient average Q.
   if (cm->frame_type == KEY_FRAME) {
@@ -1155,7 +1085,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
         3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2);
   } else if (!rc->is_src_frame_alt_ref &&
       (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) &&
-      !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) {
+      !(cpi->use_svc && oxcf->end_usage == USAGE_STREAM_FROM_SERVER)) {
     rc->last_q[2] = cm->base_qindex;
     rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO(
         3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2);
@@ -1201,12 +1131,11 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
 
   // Actual bits spent
   rc->total_actual_bits += rc->projected_frame_size;
+  rc->total_target_bits += (cm->show_frame ? rc->av_per_frame_bandwidth : 0);
 
-  // Debug stats
-  rc->total_target_vs_actual += (rc->this_frame_target -
-                                 rc->projected_frame_size);
+  rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame &&
+  if (oxcf->play_alternate && cpi->refresh_alt_ref_frame &&
       (cm->frame_type != KEY_FRAME))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
@@ -1239,15 +1168,15 @@ static int test_for_kf_one_pass(VP9_COMP *cpi) {
 
 static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
   static const int af_ratio = 10;
-  const RATE_CONTROL *rc = &cpi->rc;
+  const RATE_CONTROL *const rc = &cpi->rc;
   int target;
 #if USE_ALTREF_FOR_ONE_PASS
   target = (!rc->is_src_frame_alt_ref &&
             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ?
-      (rc->av_per_frame_bandwidth * cpi->rc.baseline_gf_interval * af_ratio) /
-      (cpi->rc.baseline_gf_interval + af_ratio - 1) :
-      (rc->av_per_frame_bandwidth * cpi->rc.baseline_gf_interval) /
-      (cpi->rc.baseline_gf_interval + af_ratio - 1);
+      (rc->av_per_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+      (rc->baseline_gf_interval + af_ratio - 1) :
+      (rc->av_per_frame_bandwidth * rc->baseline_gf_interval) /
+      (rc->baseline_gf_interval + af_ratio - 1);
 #else
   target = rc->av_per_frame_bandwidth;
 #endif
@@ -1299,18 +1228,19 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
 static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
+  const SVC *const svc = &cpi->svc;
   const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
   const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
                              FRAME_OVERHEAD_BITS);
   int target = rc->av_per_frame_bandwidth;
-  if (cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+  if (svc->number_temporal_layers > 1 &&
+      oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
     // Note that for layers, av_per_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
-    int current_temporal_layer = cpi->svc.temporal_layer_id;
-    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[current_temporal_layer];
+    int current_temporal_layer = svc->temporal_layer_id;
+    const LAYER_CONTEXT *lc = &svc->layer_context[current_temporal_layer];
     target = lc->avg_frame_size;
     min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
   }
@@ -1347,13 +1277,14 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
 
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  int target = cpi->rc.av_per_frame_bandwidth;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target = rc->av_per_frame_bandwidth;
   if ((cm->current_video_frame == 0) ||
       (cm->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (cpi->rc.frames_since_key %
+      (cpi->oxcf.auto_key && (rc->frames_since_key %
                               cpi->key_frame_frequency == 0))) {
     cm->frame_type = KEY_FRAME;
-    cpi->rc.source_alt_ref_active = 0;
+    rc->source_alt_ref_active = 0;
     if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
       target = calc_iframe_target_size_one_pass_cbr(cpi);
     }
@@ -1364,8 +1295,8 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
     }
   }
   vp9_rc_set_frame_target(cpi, target);
-  cpi->rc.frames_till_gf_update_due = INT_MAX;
-  cpi->rc.baseline_gf_interval = INT_MAX;
+  rc->frames_till_gf_update_due = INT_MAX;
+  rc->baseline_gf_interval = INT_MAX;
 }
 
 void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
@@ -1392,3 +1323,46 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
   rc->frames_till_gf_update_due = INT_MAX;
   rc->baseline_gf_interval = INT_MAX;
 }
+
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) {
+  const int lo = rc->best_quality;   // Inclusive lower search bound.
+  const int hi = rc->worst_quality;  // Exclusive bound; result if range empty.
+  int from = hi;
+  int to = hi;
+  int qi;
+
+  // Map qstart to the first index in [lo, hi) with q >= qstart (else last).
+  for (qi = lo; qi < hi; ++qi) {
+    from = qi;
+    if (vp9_convert_qindex_to_q(qi) >= qstart) break;
+  }
+
+  // Map qtarget to an index in exactly the same way.
+  for (qi = lo; qi < hi; ++qi) {
+    to = qi;
+    if (vp9_convert_qindex_to_q(qi) >= qtarget) break;
+  }
+
+  return to - from;
+}
+
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio) {
+  const int lo = rc->best_quality;
+  const int hi = rc->worst_quality;
+  // Bits-per-MB figure at the starting index, then scaled by the given ratio.
+  const int start_bits = vp9_rc_bits_per_mb(frame_type, qindex, 1.0);
+  const int wanted_bits = (int)(rate_target_ratio * start_bits);
+  int found = hi;  // Survives only when [lo, hi) is empty.
+  int qi;
+
+  // First index in [lo, hi) whose bits-per-MB figure is <= wanted_bits;
+  // the last index scanned is kept when none qualifies.
+  for (qi = lo; qi < hi; ++qi) {
+    found = qi;
+    if (vp9_rc_bits_per_mb(frame_type, qi, 1.0) <= wanted_bits) break;
+  }
+
+  // The delta is expressed relative to the caller's qindex.
+  return found - qindex;
+}
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index ed6266f..7693c2b 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -22,6 +22,9 @@ extern "C" {
 
 #define FRAME_OVERHEAD_BITS 200
 
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
+
 typedef struct {
   // Rate targetting variables
   int this_frame_target;
@@ -58,7 +61,7 @@ typedef struct {
   int ni_av_qi;
   int ni_tot_qi;
   int ni_frames;
-  int avg_frame_qindex[3];  // 0 - KEY, 1 - INTER, 2 - ARF/GF
+  int avg_frame_qindex[3];        // 0 - KEY, 1 - INTER, 2 - ARF/GF
   double tot_q;
   double avg_q;
 
@@ -75,7 +78,8 @@ typedef struct {
   int long_rolling_actual_bits;
 
   int64_t total_actual_bits;
-  int total_target_vs_actual;        // debug stats
+  int64_t total_target_bits;
+  int64_t total_target_vs_actual;
 
   int worst_quality;
   int best_quality;
@@ -83,17 +87,13 @@ typedef struct {
 } RATE_CONTROL;
 
 struct VP9_COMP;
+struct VP9_CONFIG;
 
-void vp9_save_coding_context(struct VP9_COMP *cpi);
-void vp9_restore_coding_context(struct VP9_COMP *cpi);
-
-void vp9_setup_key_frame(struct VP9_COMP *cpi);
-void vp9_setup_inter_frame(struct VP9_COMP *cpi);
+void vp9_rc_init(const struct VP9_CONFIG *oxcf, int pass, RATE_CONTROL *rc);
 
 double vp9_convert_qindex_to_q(int qindex);
 
-// initialize luts for minq
-void vp9_rc_init_minq_luts(void);
+void vp9_rc_init_minq_luts(void);  // Initializes the minq LUTs.
 
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -166,6 +166,15 @@ int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi,
 // This function is called only from the vp9_rc_get_..._params() functions.
 void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target);
 
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// (qstart) to a target q value (qtarget).
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget);
+
+// Computes a q delta (in "q index" terms) to get from a starting q index
+// to an index that should equate to the given rate ratio.
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index 2fd25ef..dcd2852 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -244,7 +244,6 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 
 static void set_block_thresholds(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
-  const SPEED_FEATURES *const sf = &cpi->sf;
   int i, bsize, segment_id;
 
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
@@ -261,13 +260,13 @@ static void set_block_thresholds(VP9_COMP *cpi) {
 
       for (i = 0; i < MAX_MODES; ++i)
         cpi->rd_threshes[segment_id][bsize][i] =
-            sf->thresh_mult[i] < thresh_max ? sf->thresh_mult[i] * t / 4
+            cpi->rd_thresh_mult[i] < thresh_max ? cpi->rd_thresh_mult[i] * t / 4
                                             : INT_MAX;
 
       for (i = 0; i < MAX_REFS; ++i) {
         cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
-            sf->thresh_mult_sub8x8[i] < thresh_max
-                ? sf->thresh_mult_sub8x8[i] * t / 4
+            cpi->rd_thresh_mult_sub8x8[i] < thresh_max
+                ? cpi->rd_thresh_mult_sub8x8[i] * t / 4
                 : INT_MAX;
       }
     }
@@ -433,7 +432,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
   int i;
   int64_t rate_sum = 0;
   int64_t dist_sum = 0;
-  const int ref = xd->mi_8x8[0]->mbmi.ref_frame[0];
+  const int ref = xd->mi[0]->mbmi.ref_frame[0];
   unsigned int sse;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -557,7 +556,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
                               const int16_t *scan, const int16_t *nb,
                               int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const struct macroblock_plane *p = &x->plane[plane];
   const struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
@@ -566,7 +565,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
   const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
-  uint8_t *p_tok = x->token_cache;
+  uint8_t token_cache[32 * 32];
   int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
   // Check for consistency of tx_size with mode info
@@ -584,7 +583,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
     int v = qcoeff[0];
     int prev_t = vp9_dct_value_tokens_ptr[v].token;
     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
-    p_tok[0] = vp9_pt_energy_class[prev_t];
+    token_cache[0] = vp9_pt_energy_class[prev_t];
     ++token_costs;
 
     // ac tokens
@@ -597,9 +596,9 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
       if (use_fast_coef_costing) {
         cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
       } else {
-        pt = get_coef_context(nb, p_tok, c);
+        pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
-        p_tok[rc] = vp9_pt_energy_class[t];
+        token_cache[rc] = vp9_pt_energy_class[t];
       }
       prev_t = t;
       if (!--band_left) {
@@ -613,7 +612,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
       if (use_fast_coef_costing) {
         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
       } else {
-        pt = get_coef_context(nb, p_tok, c);
+        pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[0][pt][EOB_TOKEN];
       }
     }
@@ -639,7 +638,7 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
                                &this_sse) >> shift;
   args->sse  = this_sse >> shift;
 
-  if (x->skip_encode && !is_inter_block(&xd->mi_8x8[0]->mbmi)) {
+  if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
                     (1 << ss_txfrm_size)) >> (shift + 2);
@@ -664,7 +663,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int64_t rd1, rd2, rd;
 
   if (args->skip)
@@ -750,7 +749,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
   args.use_fast_coef_costing = use_fast_coef_casting;
 
   if (plane == 0)
-    xd->mi_8x8[0]->mbmi.tx_size = tx_size;
+    xd->mi[0]->mbmi.tx_size = tx_size;
 
   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
@@ -780,7 +779,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
   VP9_COMMON *const cm = &cpi->common;
   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 
@@ -799,7 +798,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
@@ -882,7 +881,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
@@ -952,7 +951,7 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   TX_SIZE tx_size;
 
@@ -995,7 +994,7 @@ static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                   int64_t ref_best_rd) {
   int64_t sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
   assert(bs == mbmi->sb_type);
   if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
@@ -1071,7 +1070,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
   vpx_memcpy(ta, a, sizeof(ta));
   vpx_memcpy(tl, l, sizeof(tl));
-  xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
+  xd->mi[0]->mbmi.tx_size = TX_4X4;
 
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
@@ -1100,7 +1099,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
                                                             p->src_diff);
         int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
-        xd->mi_8x8[0]->bmi[block].as_mode = mode;
+        xd->mi[0]->bmi[block].as_mode = mode;
         vp9_predict_intra_block(xd, block, 1,
                                 TX_4X4, mode,
                                 x->skip_encode ? src : dst,
@@ -1173,10 +1172,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
                                             int64_t best_rd) {
   int i, j;
   const MACROBLOCKD *const xd = &mb->e_mbd;
-  MODE_INFO *const mic = xd->mi_8x8[0];
-  const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
-  const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
-  const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
+  MODE_INFO *const mic = xd->mi[0];
+  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
@@ -1243,7 +1242,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE mode_selected = DC_PRED;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi_8x8[0];
+  MODE_INFO *const mic = xd->mi[0];
   int this_rate, this_rate_tokenonly, s;
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
@@ -1257,8 +1256,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   /* Y Search for intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int64_t local_tx_cache[TX_MODES];
-    MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
-    MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
+    MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+    MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
 
     if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
       continue;
@@ -1312,7 +1311,7 @@ static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
                              int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
   int plane;
   int pnrate = 0, pnskip = 1;
@@ -1369,7 +1368,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
-    xd->mi_8x8[0]->mbmi.uv_mode = mode;
+    xd->mi[0]->mbmi.uv_mode = mode;
 
     super_block_uvrd(cpi, x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
@@ -1410,7 +1409,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  xd->mi_8x8[0]->mbmi.uv_mode = mode_selected;
+  xd->mi[0]->mbmi.uv_mode = mode_selected;
   return best_rd;
 }
 
@@ -1421,7 +1420,7 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
   const VP9_COMMON *cm = &cpi->common;
   int64_t unused;
 
-  x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
+  x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
   *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
@@ -1447,13 +1446,13 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
   }
-  *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
+  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
 }
 
-static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
+static int cost_mv_ref(const VP9_COMP *cpi, MB_PREDICTION_MODE mode,
                        int mode_context) {
-  MACROBLOCK *const x = &cpi->mb;
-  const int segment_id = x->e_mbd.mi_8x8[0]->mbmi.segment_id;
+  const MACROBLOCK *const x = &cpi->mb;
+  const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
 
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
@@ -1478,7 +1477,7 @@ static int labels2mode(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
                        int_mv seg_mvs[MAX_REF_FRAMES],
                        int_mv *best_ref_mv[2],
                        const int *mvjcost, int *mvcost[2]) {
-  MODE_INFO *const mic = xd->mi_8x8[0];
+  MODE_INFO *const mic = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mic->mbmi;
   int thismvcost = 0;
   int idx, idy;
@@ -1546,7 +1545,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   MACROBLOCKD *xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
-  MODE_INFO *const mi = xd->mi_8x8[0];
+  MODE_INFO *const mi = xd->mi[0];
   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
@@ -1560,6 +1559,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   int thisrate = 0, ref;
   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
   const int is_compound = has_second_ref(&mi->mbmi);
+  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
                                                pd->pre[ref].stride)];
@@ -1567,7 +1568,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
-                              xd->interp_kernel, MV_PRECISION_Q3,
+                              kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE + 4 * (i % 2),
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
@@ -1643,7 +1644,7 @@ static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
 }
 
 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
-  MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
 
@@ -1658,7 +1659,7 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
 
 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
                                   struct buf_2d orig_pre[2]) {
-  MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
   x->plane[0].src = orig_src;
   x->e_mbd.plane[0].pre[0] = orig_pre[0];
   if (has_second_ref(mbmi))
@@ -1669,6 +1670,45 @@ static INLINE int mv_has_subpel(const MV *mv) {
   return (mv->row & 0x0F) || (mv->col & 0x0F);
 }
 
+// Returns 0 only when this_mode has zero motion that another of NEARESTMV,
+// NEARMV or ZEROMV codes at lower cost (ZEROMV also loses ties); else 1.
+// TODO(aconverse): Find out if this is still productive then clean up or remove
+static int check_best_zero_mv(
+    const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+    int disable_inter_mode_mask, int this_mode, int ref_frame,
+    int second_ref_frame) {
+  const int compound = second_ref_frame != NONE;
+  int ctx, c_near, c_nearest, c_zero;
+
+  // The comparison only applies while ZEROMV itself is still enabled.
+  if (disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) return 1;
+  if (this_mode != NEARMV && this_mode != NEARESTMV && this_mode != ZEROMV)
+    return 1;
+  // Every reference in use must have a zero vector for this mode.
+  if (frame_mv[this_mode][ref_frame].as_int != 0) return 1;
+  if (compound && frame_mv[this_mode][second_ref_frame].as_int != 0) return 1;
+
+  ctx = mode_context[ref_frame];
+  c_near = cost_mv_ref(cpi, NEARMV, ctx);
+  c_nearest = cost_mv_ref(cpi, NEARESTMV, ctx);
+  c_zero = cost_mv_ref(cpi, ZEROMV, ctx);
+
+  // NEARMV/NEARESTMV survive unless ZEROMV is strictly cheaper.
+  if (this_mode == NEARMV) return c_near <= c_zero;
+  if (this_mode == NEARESTMV) return c_nearest <= c_zero;
+
+  // ZEROMV is dropped when a no-more-expensive mode is also all-zero.
+  assert(this_mode == ZEROMV);
+  if (c_zero >= c_nearest && frame_mv[NEARESTMV][ref_frame].as_int == 0 &&
+      (!compound || frame_mv[NEARESTMV][second_ref_frame].as_int == 0))
+    return 0;
+  if (c_zero >= c_near && frame_mv[NEARMV][ref_frame].as_int == 0 &&
+      (!compound || frame_mv[NEARMV][second_ref_frame].as_int == 0))
+    return 0;
+  return 1;
+}
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     const TileInfo *const tile,
                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
@@ -1679,7 +1719,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   MB_PREDICTION_MODE this_mode;
   MACROBLOCKD *xd = &x->e_mbd;
   VP9_COMMON *cm = &cpi->common;
-  MODE_INFO *mi = xd->mi_8x8[0];
+  MODE_INFO *mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
@@ -1737,43 +1777,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         if (disable_inter_mode_mask & (1 << mode_idx))
           continue;
 
-        // if we're near/nearest and mv == 0,0, compare to zeromv
-        if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
-            (this_mode == NEARMV || this_mode == NEARESTMV ||
-             this_mode == ZEROMV) &&
-            frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
-            (!has_second_rf ||
-             frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
-          int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
-          int c1 = cost_mv_ref(cpi, NEARMV, rfc);
-          int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
-          int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
-
-          if (this_mode == NEARMV) {
-            if (c1 > c3)
-              continue;
-          } else if (this_mode == NEARESTMV) {
-            if (c2 > c3)
-              continue;
-          } else {
-            assert(this_mode == ZEROMV);
-            if (!has_second_rf) {
-              if ((c3 >= c2 &&
-                   frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
-                  (c3 >= c1 &&
-                   frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
-                continue;
-            } else {
-              if ((c3 >= c2 &&
-                   frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
-                   frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
-                  (c3 >= c1 &&
-                   frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
-                   frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
-                continue;
-            }
-          }
-        }
+        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
+                                disable_inter_mode_mask,
+                                this_mode, mbmi->ref_frame[0],
+                                mbmi->ref_frame[1]))
+          continue;
 
         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
@@ -2090,7 +2098,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   int i;
   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi_8x8[0];
+  MODE_INFO *mi = xd->mi[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
   int mode_idx;
 
@@ -2137,7 +2145,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                     uint8_t *ref_y_buffer, int ref_y_stride,
                     int ref_frame, BLOCK_SIZE block_size ) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int_mv this_mv;
   int i;
   int zero_seen = 0;
@@ -2267,7 +2275,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   // restored if we decide to encode this way
   ctx->skip = x->skip;
   ctx->best_mode_index = mode_index;
-  ctx->mic = *xd->mi_8x8[0];
+  ctx->mic = *xd->mi[0];
 
   ctx->best_ref_mv[0].as_int = ref_mv->as_int;
   ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
@@ -2318,7 +2326,7 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
   const VP9_COMMON *cm = &cpi->common;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mi = xd->mi_8x8[0];
+  MODE_INFO *const mi = xd->mi[0];
   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
 
@@ -2350,9 +2358,9 @@ const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
   return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
 }
 
-static INLINE int get_switchable_rate(const MACROBLOCK *x) {
+int vp9_get_switchable_rate(const MACROBLOCK *x) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   return SWITCHABLE_INTERP_RATE_FACTOR *
              x->switchable_interp_costs[ctx][mbmi->interp_filter];
@@ -2365,7 +2373,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int bestsme = INT_MAX;
   int further_steps, step_param;
@@ -2531,13 +2539,14 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int refs[2] = { mbmi->ref_frame[0],
                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int_mv ref_mv[2];
   int ite, ref;
   // Prediction buffer from second frame.
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+  const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
   // Do joint motion search in compound mode to get more accurate mv.
   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
@@ -2591,7 +2600,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                               &frame_mv[refs[!id]].as_mv,
                               &xd->block_refs[!id]->sf,
                               pw, ph, 0,
-                              xd->interp_kernel, MV_PRECISION_Q3,
+                              kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE, mi_row * MI_SIZE);
 
     // Compound motion search on first ref frame.
@@ -2692,7 +2701,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  const int64_t ref_best_rd) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int is_comp_pred = has_second_ref(mbmi);
   const int num_refs = is_comp_pred ? 2 : 1;
   const int this_mode = mbmi->mode;
@@ -2744,7 +2753,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         return INT64_MAX;
       *rate2 += rate_mv;
       frame_mv[refs[0]].as_int =
-          xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
       single_newmv[refs[0]].as_int = tmp_mv.as_int;
     }
   }
@@ -2806,8 +2815,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int j;
         int64_t rs_rd;
         mbmi->interp_filter = i;
-        xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
-        rs = get_switchable_rate(x);
+        rs = vp9_get_switchable_rate(x);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
         if (i > 0 && intpel_mv) {
@@ -2877,8 +2885,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : *best_filter;
-  xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
-  rs = cm->interp_filter == SWITCHABLE ? get_switchable_rate(x) : 0;
+  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(x) : 0;
 
   if (pred_exists) {
     if (best_needs_copy) {
@@ -2908,7 +2915,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (cm->interp_filter == SWITCHABLE)
-    *rate2 += get_switchable_rate(x);
+    *rate2 += vp9_get_switchable_rate(x);
 
   if (!is_comp_pred) {
     if (!x->in_active_map) {
@@ -3066,7 +3073,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   TX_SIZE max_uv_tx_size;
   x->skip_encode = 0;
   ctx->skip = 0;
-  xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
 
   if (bsize >= BLOCK_8X8) {
     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -3075,7 +3082,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       *returnrate = INT_MAX;
       return;
     }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi_8x8[0]->mbmi.tx_size, bsize);
+    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
                             &dist_uv, &uv_skip, bsize, max_uv_tx_size);
   } else {
@@ -3085,7 +3092,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       *returnrate = INT_MAX;
       return;
     }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi_8x8[0]->mbmi.tx_size, bsize);
+    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
                             &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
   }
@@ -3108,7 +3115,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
   }
 
-  ctx->mic = *xd->mi_8x8[0];
+  ctx->mic = *xd->mi[0];
 }
 
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
@@ -3121,9 +3128,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const struct segmentation *const seg = &cm->seg;
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
@@ -3189,7 +3195,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       vp9_setup_buffer_inter(cpi, x, tile,
-                             ref_frame, block_size, mi_row, mi_col,
+                             ref_frame, bsize, mi_row, mi_col,
                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3371,46 +3377,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         }
       }
     } else {
-      // TODO(aconverse): Find out if this is still productive then clean up or
-      // remove
-      // if we're near/nearest and mv == 0,0, compare to zeromv
       if (x->in_active_map &&
-          !(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
-          (this_mode == NEARMV || this_mode == NEARESTMV ||
-          this_mode == ZEROMV) &&
-          frame_mv[this_mode][ref_frame].as_int == 0 &&
-          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
-          (!comp_pred || frame_mv[this_mode][second_ref_frame].as_int == 0)) {
-        int rfc = mbmi->mode_context[ref_frame];
-        int c1 = cost_mv_ref(cpi, NEARMV, rfc);
-        int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
-        int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
-
-        if (this_mode == NEARMV) {
-          if (c1 > c3)
-            continue;
-        } else if (this_mode == NEARESTMV) {
-          if (c2 > c3)
-            continue;
-        } else {
-          assert(this_mode == ZEROMV);
-          if (!comp_pred) {
-            if ((c3 >= c2 &&
-                 frame_mv[NEARESTMV][ref_frame].as_int == 0) ||
-                (c3 >= c1 &&
-                 frame_mv[NEARMV][ref_frame].as_int == 0))
-              continue;
-          } else {
-            if ((c3 >= c2 &&
-                 frame_mv[NEARESTMV][ref_frame].as_int == 0 &&
-                 frame_mv[NEARESTMV][second_ref_frame].as_int == 0) ||
-                (c3 >= c1 &&
-                 frame_mv[NEARMV][ref_frame].as_int == 0 &&
-                 frame_mv[NEARMV][second_ref_frame].as_int == 0))
-              continue;
-          }
-        }
-      }
+          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
+                                disable_inter_mode_mask, this_mode, ref_frame,
+                                second_ref_frame))
+          continue;
     }
 
     mbmi->mode = this_mode;
@@ -3423,7 +3395,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                                           : cm->interp_filter;
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     // Select prediction reference frames.
     for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -3788,9 +3759,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                       int64_t best_rd_so_far) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const struct segmentation *seg = &cm->seg;
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i;
@@ -3850,7 +3820,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       vp9_setup_buffer_inter(cpi, x, tile,
-                             ref_frame, block_size, mi_row, mi_col,
+                             ref_frame, bsize, mi_row, mi_col,
                              frame_mv[NEARESTMV], frame_mv[NEARMV],
                              yv12_mb);
     }
@@ -3968,7 +3938,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     // them for this frame.
     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                           : cm->interp_filter;
-    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (comp_pred) {
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -4067,7 +4036,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
-      xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
+      xd->mi[0]->mbmi.tx_size = TX_4X4;
 
       cpi->mask_filter_rd = 0;
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
@@ -4091,7 +4060,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
             int newbest, rs;
             int64_t rs_rd;
             mbmi->interp_filter = switchable_filter_index;
-            xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
             tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
                                                  &mbmi->ref_mvs[ref_frame][0],
                                                  second_ref,
@@ -4104,7 +4072,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
             if (tmp_rd == INT64_MAX)
               continue;
-            rs = get_switchable_rate(x);
+            rs = vp9_get_switchable_rate(x);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
             cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
             cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
@@ -4131,7 +4099,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mbmi;
               for (i = 0; i < 4; i++) {
-                tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+                tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
               }
               pred_exists = 1;
@@ -4156,7 +4124,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                              tmp_best_filter : cm->interp_filter);
-      xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
@@ -4179,14 +4146,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         skippable = tmp_best_skippable;
         *mbmi = tmp_best_mbmode;
         for (i = 0; i < 4; i++)
-          xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i];
+          xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
       }
 
       rate2 += rate;
       distortion2 += distortion;
 
       if (cm->interp_filter == SWITCHABLE)
-        rate2 += get_switchable_rate(x);
+        rate2 += vp9_get_switchable_rate(x);
 
       if (!mode_excluded)
         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -4263,8 +4230,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     // Keep record of best inter rd with single reference
-    if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
-        !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
+    if (is_inter_block(&xd->mi[0]->mbmi) &&
+        !has_second_ref(&xd->mi[0]->mbmi) &&
         !mode_excluded &&
         this_rd < best_inter_rd) {
       best_inter_rd = this_rd;
@@ -4304,7 +4271,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                    sizeof(uint8_t) * ctx->num_4x4_blk);
 
         for (i = 0; i < 4; i++)
-          best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+          best_bmodes[i] = xd->mi[0]->bmi[i];
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -4453,13 +4420,13 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   x->skip |= best_skip2;
   if (!is_inter_block(&best_mbmode)) {
     for (i = 0; i < 4; i++)
-      xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+      xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
   } else {
     for (i = 0; i < 4; ++i)
-      vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+      vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
 
-    mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
+    mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
   }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h
index 6968fa6..a01dbd4 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/libvpx/vp9/encoder/vp9_rdopt.h
@@ -40,6 +40,8 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist);
 
+int vp9_get_switchable_rate(const MACROBLOCK *x);
+
 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                             const TileInfo *const tile,
                             MV_REFERENCE_FRAME ref_frame,
diff --git a/libvpx/vp9/encoder/vp9_sad.c b/libvpx/vp9/encoder/vp9_sad.c
index 58c5df4..9d8da0d 100644
--- a/libvpx/vp9/encoder/vp9_sad.c
+++ b/libvpx/vp9/encoder/vp9_sad.c
@@ -44,7 +44,7 @@ unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, int src_stride, \
                                       const uint8_t *second_pred, \
                                       unsigned int max_sad) { \
   uint8_t comp_pred[m * n]; \
-  comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+  vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
   return sad(src_ptr, src_stride, comp_pred, m, m, n); \
 }
 
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index fd8fa53..9d3e6dc 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -133,8 +133,8 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  xd->mi_8x8 = mi_8x8;
-  segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+  xd->mi = mi_8x8;
+  segment_id = xd->mi[0]->mbmi.segment_id;
 
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
@@ -152,7 +152,7 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
 
     // Store the prediction status for this mb and update counts
     // as appropriate
-    xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag;
+    xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
     temporal_predictor_count[pred_context][pred_flag]++;
 
     if (!pred_flag)
@@ -169,7 +169,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
                           int mi_row, int mi_col,
                           BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   int bw, bh;
   const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
 
@@ -229,7 +229,7 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   vp9_prob t_pred_tree[SEG_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   MODE_INFO **mi_ptr, **mi;
 
   // Set default state for the segment tree probabilities and the
diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c
new file mode 100644
index 0000000..d6b6174
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_speed_features.c
@@ -0,0 +1,394 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_speed_features.h"
+
+#define ALL_INTRA_MODES ((1 << DC_PRED) | \
+                         (1 << V_PRED) | (1 << H_PRED) | \
+                         (1 << D45_PRED) | (1 << D135_PRED) | \
+                         (1 << D117_PRED) | (1 << D153_PRED) | \
+                         (1 << D207_PRED) | (1 << D63_PRED) | \
+                         (1 << TM_PRED))  // One bit per listed mode.
+#define INTRA_DC_ONLY   (1 << DC_PRED)
+#define INTRA_DC_TM     ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_H_V    ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
+#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
+
+// Masks for partially or completely disabling split mode, one bit per THR_*.
+#define DISABLE_ALL_INTER_SPLIT   ((1 << THR_COMP_GA) | \
+                                   (1 << THR_COMP_LA) | \
+                                   (1 << THR_ALTR) | \
+                                   (1 << THR_GOLD) | \
+                                   (1 << THR_LAST))
+
+#define DISABLE_ALL_SPLIT         ((1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT)
+
+#define DISABLE_COMPOUND_SPLIT    ((1 << THR_COMP_GA) | (1 << THR_COMP_LA))
+
+#define LAST_AND_INTRA_SPLIT_ONLY ((1 << THR_COMP_GA) | \
+                                   (1 << THR_COMP_LA) | \
+                                   (1 << THR_ALTR) | \
+                                   (1 << THR_GOLD))  // LAST and INTRA unset.
+
+static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
+                                   SPEED_FEATURES *sf, int speed) {
+  sf->adaptive_rd_thresh = 1;  // Raised by the speed tiers below.
+  sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
+  sf->allow_skip_recode = 1;
+
+  if (speed >= 1) {  // Tiers are cumulative: each also runs for higher speeds.
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check  = 1;
+    sf->tx_size_search_method = vp9_frame_is_boosted(cpi) ? USE_FULL_RD
+                                                          : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->subpel_iters_per_step = 1;
+    sf->mode_skip_start = 10;
+    sf->adaptive_pred_interp_filter = 1;
+
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;  // Replaces the KFMAXBW default.
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 2) {  // tx_size_search_method repeats the speed 1 value.
+    sf->tx_size_search_method = vp9_frame_is_boosted(cpi) ? USE_FULL_RD
+                                                          : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+    sf->adaptive_pred_interp_filter = 2;
+    sf->reference_masking = 1;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+  }
+
+  if (speed >= 3) {
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->adaptive_rd_thresh = 3;
+    sf->mode_skip_start = 6;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->use_fast_coef_costing = 1;
+  }
+
+  if (speed >= 4) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH |
+                                  FLAG_EARLY_TERMINATE;  // On top of speed 2.
+    sf->disable_filter_search_var_thresh = 200;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->use_lp32x32fdct = 1;
+  }
+
+  if (speed >= 5) {
+    int i;
+
+    sf->partition_search_type = FIXED_PARTITION;
+    sf->optimize_coefficients = 0;
+    sf->search_method = HEX;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; ++i) {  // Overrides the speed 1 masks too.
+      sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+    }
+    cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;  // Sole cpi write.
+  }
+}
+
+static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
+                                 int speed) {
+  sf->static_segmentation = 0;
+  sf->adaptive_rd_thresh = 1;
+  sf->encode_breakout_thresh = 1;
+  sf->use_fast_coef_costing = 1;
+
+  if (speed == 1) {  // Exactly 1: the speed >= 2 tier sets these itself.
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_interp_filter = 1;
+    sf->auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->encode_breakout_thresh = 8;
+  }
+
+  if (speed >= 2) {  // From here on the tiers are cumulative.
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ?
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_interp_filter = 2;
+    sf->auto_mv_step_size = 1;
+    sf->reference_masking = 1;
+
+    sf->disable_filter_search_var_thresh = 50;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+
+    sf->adaptive_rd_thresh = 2;
+    sf->use_lp32x32fdct = 1;
+    sf->mode_skip_start = 11;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->encode_breakout_thresh = 200;
+  }
+
+  if (speed >= 3) {
+    sf->use_square_partition_only = 1;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->constrain_copy_partition = 1;
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->skip_encode_sb = 1;
+    sf->subpel_iters_per_step = 1;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+    sf->allow_skip_recode = 0;
+    sf->optimize_coefficients = 0;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
+    sf->encode_breakout_thresh = 700;
+  }
+
+  if (speed >= 4) {
+    int i;
+    sf->last_partitioning_redo_frequency = 4;  // Divisor of the % tests below.
+    sf->adaptive_rd_thresh = 5;
+    sf->use_fast_coef_costing = 0;  // Undoes the default set at the top.
+    sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
+    sf->adjust_partitioning_from_last_frame =
+        cm->last_frame_type != cm->frame_type || (0 ==
+        (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency);
+    sf->subpel_force_stop = 1;
+    for (i = 0; i < TX_SIZES; i++) {
+      sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+    }
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY;
+    sf->frame_parameter_update = 0;
+    sf->encode_breakout_thresh = 1000;
+    sf->search_method = FAST_HEX;
+    sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
+    sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->max_intra_bsize = BLOCK_32X32;
+    sf->allow_skip_recode = 1;  // Re-enabled; the speed >= 3 tier cleared it.
+  }
+
+  if (speed >= 5) {
+    sf->max_partition_size = BLOCK_32X32;
+    sf->min_partition_size = BLOCK_8X8;
+    sf->partition_check =  // Frequency is 4 here (set by speed >= 4).
+        (cm->current_video_frame % sf->last_partitioning_redo_frequency == 1);
+    sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
+        (cm->current_video_frame %
+            (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+    sf->partition_search_type = REFERENCE_PARTITION;  // Replaced at speed >= 6.
+    sf->use_nonrd_pick_mode = 1;
+    sf->search_method = FAST_DIAMOND;
+    sf->allow_skip_recode = 0;  // Cleared again for speed >= 5.
+  }
+
+  if (speed >= 6) {
+    // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
+    sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
+    sf->search_type_check_frequency = 50;
+    sf->source_var_thresh = 360;
+
+    sf->use_nonrd_pick_mode = 1;  // Same value as the speed >= 5 tier.
+    sf->search_method = FAST_DIAMOND;  // Same value as the speed >= 5 tier.
+  }
+
+  if (speed >= 7) {
+    int i;
+    for (i = 0; i < BLOCK_SIZES; ++i)  // Every size: all bits but NEARESTMV.
+      sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV));
+  }
+}
+
+void vp9_set_speed_features(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  VP9_COMMON *const cm = &cpi->common;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  const int speed = cpi->speed < 0 ? -cpi->speed : cpi->speed;
+  int i;
+
+  // best quality defaults
+  sf->frame_parameter_update = 1;
+  sf->search_method = NSTEP;
+  sf->recode_loop = ALLOW_RECODE;
+  sf->subpel_search_method = SUBPEL_TREE;
+  sf->subpel_iters_per_step = 2;
+  sf->subpel_force_stop = 0;
+  sf->optimize_coefficients = !oxcf->lossless;
+  sf->reduce_first_step_size = 0;
+  sf->auto_mv_step_size = 0;
+  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  sf->adaptive_rd_thresh = 0;
+  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
+  sf->tx_size_search_method = USE_FULL_RD;
+  sf->use_lp32x32fdct = 0;
+  sf->adaptive_motion_search = 0;
+  sf->adaptive_pred_interp_filter = 0;
+  sf->reference_masking = 0;
+  sf->partition_search_type = SEARCH_PARTITION;
+  sf->less_rectangular_check = 0;
+  sf->use_square_partition_only = 0;
+  sf->auto_min_max_partition_size = NOT_IN_USE;
+  sf->max_partition_size = BLOCK_64X64;
+  sf->min_partition_size = BLOCK_4X4;
+  sf->adjust_partitioning_from_last_frame = 0;
+  sf->last_partitioning_redo_frequency = 4;
+  sf->constrain_copy_partition = 0;
+  sf->disable_split_mask = 0;
+  sf->mode_search_skip_flags = 0;
+  sf->force_frame_boost = 0;
+  sf->max_delta_qindex = 0;
+  sf->disable_split_var_thresh = 0;
+  sf->disable_filter_search_var_thresh = 0;
+  for (i = 0; i < TX_SIZES; i++) {
+    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
+    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+  }
+  sf->use_rd_breakout = 0;
+  sf->skip_encode_sb = 0;
+  sf->use_uv_intra_rd_estimate = 0;
+  sf->allow_skip_recode = 0;
+  sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  sf->use_fast_coef_updates = TWO_LOOP;
+  sf->use_fast_coef_costing = 0;
+  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
+  sf->use_nonrd_pick_mode = 0;
+  sf->encode_breakout_thresh = 0;
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    sf->disable_inter_mode_mask[i] = 0;
+  sf->max_intra_bsize = BLOCK_64X64;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  sf->always_this_block_size = BLOCK_16X16;
+  sf->search_type_check_frequency = 50;
+  sf->source_var_thresh = 100;
+
+  // Recode loop tolerance %.
+  sf->recode_tolerance = 25;
+
+  switch (oxcf->mode) {
+    case MODE_BESTQUALITY:
+    case MODE_SECONDPASS_BEST:  // This is the best quality mode.
+      cpi->diamond_search_sad = vp9_full_range_search;
+      break;
+    case MODE_FIRSTPASS:
+    case MODE_GOODQUALITY:
+    case MODE_SECONDPASS:
+      set_good_speed_feature(cpi, cm, sf, speed);
+      break;
+    case MODE_REALTIME:
+      set_rt_speed_feature(cm, sf, speed);
+      break;
+  }
+
+  // Slow quant, dct and trellis not worthwhile for first pass
+  // so make sure they are always turned off.
+  if (cpi->pass == 1)
+    sf->optimize_coefficients = 0;
+
+  // No recode for 1 pass.
+  if (cpi->pass == 0) {
+    sf->recode_loop = DISALLOW_RECODE;
+    sf->optimize_coefficients = 0;
+  }
+
+  if (sf->subpel_search_method == SUBPEL_TREE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
+  }
+
+  cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1;
+
+  if (cpi->encode_breakout && oxcf->mode == MODE_REALTIME &&
+      sf->encode_breakout_thresh > cpi->encode_breakout)
+    cpi->encode_breakout = sf->encode_breakout_thresh;
+
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
+    sf->adaptive_pred_interp_filter = 0;
+
+  if (!cpi->oxcf.frame_periodic_boost) {
+    sf->max_delta_qindex = 0;
+  }
+}
diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h
new file mode 100644
index 0000000..72f548a
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_speed_features.h
@@ -0,0 +1,359 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#define VP9_ENCODER_VP9_SPEED_FEATURES_H_
+
+#include "vp9/common/vp9_enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2,
+  BIGDIA = 3,
+  SQUARE = 4,
+  FAST_HEX = 5,
+  FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {
+  // No recode.
+  DISALLOW_RECODE = 0,
+  // Allow recode for KF and exceeding maximum frame bandwidth.
+  ALLOW_RECODE_KFMAXBW = 1,
+  // Allow recode only for KF/ARF/GF frames.
+  ALLOW_RECODE_KFARFGF = 2,
+  // Allow recode for all frames based on bitrate constraints.
+  ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+  SUBPEL_TREE = 0,
+  // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+  LAST_FRAME_PARTITION_OFF = 0,
+  LAST_FRAME_PARTITION_LOW_MOTION = 1,
+  LAST_FRAME_PARTITION_ALL = 2
+} LAST_FRAME_PARTITION_METHOD;
+
+typedef enum {
+  USE_FULL_RD = 0,
+  USE_LARGESTINTRA,
+  USE_LARGESTINTRA_MODELINTER,
+  USE_LARGESTALL
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+  NOT_IN_USE = 0,
+  RELAXED_NEIGHBORING_MIN_MAX = 1,
+  STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+  // Try the full image with different values.
+  LPF_PICK_FROM_FULL_IMAGE,
+  // Try a small portion of the image with different values.
+  LPF_PICK_FROM_SUBIMAGE,
+  // Estimate the level based on quantizer and frame type
+  LPF_PICK_FROM_Q,
+} LPF_PICK_METHOD;
+
+typedef enum {
+  // Terminate search early based on distortion so far compared to
+  // qp step, distortion in the neighborhood of the frame, etc.
+  FLAG_EARLY_TERMINATE = 1 << 0,
+
+  // Skips comp inter modes if the best so far is an intra mode.
+  FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+  // Skips comp inter modes if the best single intermode so far does
+  // not have the same reference as one of the two references being
+  // tested.
+  FLAG_SKIP_COMP_REFMISMATCH = 1 << 2,
+
+  // Skips oblique intra modes if the best so far is an inter mode.
+  FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+  // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+  // intra so far is not one of the neighboring directions.
+  FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+  // Skips intra modes other than DC_PRED if the source variance is small
+  FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {
+  // Search partitions using RD/NONRD criterion
+  SEARCH_PARTITION = 0,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION = 1,
+
+  // Use a fixed size partition in every 64X64 SB, where the size is
+  // determined based on source variance
+  VAR_BASED_FIXED_PARTITION = 2,
+
+  REFERENCE_PARTITION = 3,
+
+  // Use an arbitrary partitioning scheme based on source variance within
+  // a 64X64 SB
+  VAR_BASED_PARTITION,
+
+  // Use non-fixed partitions based on source variance
+  SOURCE_VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {
+  // Does a dry run to see if any of the contexts need to be updated or not,
+  // before the final run.
+  TWO_LOOP = 0,
+
+  // No dry run conducted.
+  ONE_LOOP = 1,
+
+  // No dry run, also only half the coef contexts and bands are updated.
+  // The rest are not updated at all.
+  ONE_LOOP_REDUCED = 2
+} FAST_COEFF_UPDATE;
+
+typedef struct {
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+  SEARCH_METHODS search_method;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // Subpel_search_method can only be subpel_tree which does a subpixel
+  // logarithmic search that keeps stepping at 1/2 pixel units until
+  // you stop getting a gain, and then goes on to 1/4 and repeats
+  // the same process. Along the way it skips many diagonals.
+  SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
+  int subpel_iters_per_step;
+
+  // Control when to stop subpel search
+  int subpel_force_stop;
+
+  // This parameter controls the number of steps we'll do in a diamond
+  // search.
+  int max_step_search_steps;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
+  int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
+  int auto_mv_step_size;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+  int optimize_coefficients;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
+  int comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
+  int adaptive_rd_thresh;
+
+  // Enables skipping the reconstruction step (idct, recon) in the
+  // intermediate steps assuming the last frame didn't have too many intra
+  // blocks and the q is less than a threshold.
+  int skip_encode_sb;
+  int skip_encode_frame;
+  // Speed feature to allow or disallow skipping of recode at block
+  // level within a frame.
+  int allow_skip_recode;
+
+  // This variable allows us to reuse the last frame's partition choices
+  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
+  // frame as a starting point in low motion scenes or always use it. If set
+  // we use last_partitioning_redo_frequency to determine how often to redo
+  // the partitioning from scratch. Adjust_partitioning_from_last_frame
+  // enables us to adjust up or down one partitioning from the last frame's
+  // partitioning.
+  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
+  int use_lp32x32fdct;
+
+  // TODO(JBB): remove this as it's no longer used.
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
+  int mode_skip_start;
+
+  // TODO(JBB): Remove this.
+  int reference_masking;
+
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
+  int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
+  int use_square_partition_only;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same 64x64 in last encoded frame, and the left and above neighbor.
+  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE min_partition_size;
+  BLOCK_SIZE max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
+  int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
+  int last_partitioning_redo_frequency;
+
+  // This enables constrained copy partitioning, which, given an input block
+  // size bsize, will copy previous partition for partitions less than bsize,
+  // otherwise bsize partition is used. bsize is currently set to 16x16.
+  // Used for the case where motion is detected in superblock.
+  int constrain_copy_partition;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
+  int disable_split_mask;
+
+  // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
+  int adaptive_motion_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re check all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+  int adaptive_pred_interp_filter;
+
+  // Search through variable block partition types in non-RD mode decision
+  // encoding process for RTC.
+  int partition_check;
+
+  // Use finer quantizer in every other few frames that run variable block
+  // partition type search.
+  int force_frame_boost;
+
+  // Maximally allowed base quantization index fluctuation.
+  int max_delta_qindex;
+
+  // Implements various heuristics to skip searching modes.
+  // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_LOGIC enum.
+  unsigned int mode_search_skip_flags;
+
+  // A source variance threshold below which the split mode is disabled
+  unsigned int disable_split_var_thresh;
+
+  // A source variance threshold below which filter search is disabled
+  // Choose a very large value (UINT_MAX) to use 8-tap always
+  unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
+  int intra_y_mode_mask[TX_SIZES];
+  int intra_uv_mode_mask[TX_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
+  int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
+  int use_uv_intra_rd_estimate;
+
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
+
+  // This feature limits the number of coefficients updates we actually do
+  // by only looking at counts from 1/2 the bands.
+  FAST_COEFF_UPDATE use_fast_coef_updates;
+
+  // This flag controls the use of non-RD mode decision.
+  int use_nonrd_pick_mode;
+
+  // This variable sets the encode_breakout threshold. Currently, it is only
+  // enabled in real time mode.
+  int encode_breakout_thresh;
+
+  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
+  int disable_inter_mode_mask[BLOCK_SIZES];
+
+  // This feature controls whether we do the expensive context update and
+  // calculation in the rd coefficient costing loop.
+  int use_fast_coef_costing;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // This variable controls the maximum block size where intra blocks can be
+  // used in inter frames.
+  // TODO(aconverse): Fold this into one of the other many mode skips
+  BLOCK_SIZE max_intra_bsize;
+
+  // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+  // FIXED_PARTITION search type should be used.
+  int search_type_check_frequency;
+
+  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+  int source_var_thresh;
+} SPEED_FEATURES;
+
+struct VP9_COMP;
+
+void vp9_set_speed_features(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
+
diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c
index 1435191..026e6a8 100644
--- a/libvpx/vp9/encoder/vp9_ssim.c
+++ b/libvpx/vp9/encoder/vp9_ssim.c
@@ -8,8 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp9_rtcd.h"
 
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_ssim.h"
 
 void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
                             int rp, unsigned long *sum_s, unsigned long *sum_r,
diff --git a/libvpx/vp9/encoder/vp9_ssim.h b/libvpx/vp9/encoder/vp9_ssim.h
new file mode 100644
index 0000000..a581c2c
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_ssim.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SSIM_H_
+#define VP9_ENCODER_VP9_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx_scale/yv12config.h"
+
+double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                     int lumamask, double *weight);
+
+double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SSIM_H_
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c
index eba7bc6..c2b6263 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -14,17 +14,26 @@
 #include "vp9/encoder/vp9_svc_layercontext.h"
 
 void vp9_init_layer_context(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  int temporal_layer = 0;
-  cpi->svc.spatial_layer_id = 0;
-  cpi->svc.temporal_layer_id = 0;
-  for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers;
-      ++temporal_layer) {
-    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer];
+  int layer;
+  int layer_end;
+
+  svc->spatial_layer_id = 0;
+  svc->temporal_layer_id = 0;
+
+  if (svc->number_temporal_layers > 1) {
+    layer_end = svc->number_temporal_layers;
+  } else {
+    layer_end = svc->number_spatial_layers;
+  }
+
+  for (layer = 0; layer < layer_end; ++layer) {
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
     RATE_CONTROL *const lrc = &lc->rc;
-    lrc->avg_frame_qindex[INTER_FRAME] = q_trans[oxcf->worst_allowed_q];
-    lrc->last_q[INTER_FRAME] = q_trans[oxcf->worst_allowed_q];
-    lrc->ni_av_qi = q_trans[oxcf->worst_allowed_q];
+    lc->current_video_frame_in_layer = 0;
+    lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+    lrc->ni_av_qi = oxcf->worst_allowed_q;
     lrc->total_actual_bits = 0;
     lrc->total_target_vs_actual = 0;
     lrc->ni_tot_qi = 0;
@@ -35,11 +44,19 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
     lrc->decimation_factor = 0;
     lrc->rate_correction_factor = 1.0;
     lrc->key_frame_rate_correction_factor = 1.0;
-    lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] *
-        1000;
-    lrc->buffer_level =
-        vp9_rescale((int)(oxcf->starting_buffer_level),
-                    lc->target_bandwidth, 1000);
+
+    if (svc->number_temporal_layers > 1) {
+      lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+      lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+    } else {
+      lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000;
+      lrc->last_q[0] = oxcf->best_allowed_q;
+      lrc->last_q[1] = oxcf->best_allowed_q;
+      lrc->last_q[2] = oxcf->best_allowed_q;
+    }
+
+    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
+                                    lc->target_bandwidth, 1000);
     lrc->bits_off_target = lrc->buffer_level;
   }
 }
@@ -47,16 +64,29 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
 // Update the layer context from a change_config() call.
 void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
                                             const int target_bandwidth) {
+  SVC *const svc = &cpi->svc;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
   const RATE_CONTROL *const rc = &cpi->rc;
-  int temporal_layer = 0;
+  int layer;
+  int layer_end;
   float bitrate_alloc = 1.0;
-  for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers;
-      ++temporal_layer) {
-    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer];
+
+  if (svc->number_temporal_layers > 1) {
+    layer_end = svc->number_temporal_layers;
+  } else {
+    layer_end = svc->number_spatial_layers;
+  }
+
+  for (layer = 0; layer < layer_end; ++layer) {
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
     RATE_CONTROL *const lrc = &lc->rc;
-    lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000;
-    bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth;
+
+    if (svc->number_temporal_layers > 1) {
+      lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+    } else {
+      lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000;
+    }
+    bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
     lc->starting_buffer_level =
         (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
@@ -67,7 +97,11 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
     lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
     // Update framerate-related quantities.
-    lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer];
+    if (svc->number_temporal_layers > 1) {
+      lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+    } else {
+      lc->framerate = oxcf->framerate;
+    }
     lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
     lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
     // Update qp-related quantities.
@@ -76,34 +110,70 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
   }
 }
 
-void vp9_update_layer_framerate(VP9_COMP *const cpi) {
-  int temporal_layer = cpi->svc.temporal_layer_id;
+static LAYER_CONTEXT *get_layer_context(SVC *svc) {
+  return svc->number_temporal_layers > 1 ?
+         &svc->layer_context[svc->temporal_layer_id] :
+         &svc->layer_context[svc->spatial_layer_id];
+}
+
+void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer];
+  LAYER_CONTEXT *const lc = get_layer_context(svc);
   RATE_CONTROL *const lrc = &lc->rc;
-  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer];
+  const int layer = svc->temporal_layer_id;
+
+  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
   lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
-  if (temporal_layer == 0) {
+  if (layer == 0) {
     lc->avg_frame_size = lrc->av_per_frame_bandwidth;
   } else {
-    double prev_layer_framerate = oxcf->framerate /
-        oxcf->ts_rate_decimator[temporal_layer - 1];
-    int prev_layer_target_bandwidth =
-        oxcf->ts_target_bitrate[temporal_layer - 1] * 1000;
+    const double prev_layer_framerate =
+        oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
+    const int prev_layer_target_bandwidth =
+        oxcf->ts_target_bitrate[layer - 1] * 1000;
     lc->avg_frame_size =
         (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
               (lc->framerate - prev_layer_framerate));
   }
 }
 
+void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  RATE_CONTROL *const lrc = &lc->rc;
+
+  lc->framerate = framerate;
+  lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->min_frame_bandwidth = (int)(lrc->av_per_frame_bandwidth *
+                                   oxcf->two_pass_vbrmin_section / 100);
+  lrc->max_frame_bandwidth = (int)(((int64_t)lrc->av_per_frame_bandwidth *
+                                   oxcf->two_pass_vbrmax_section) / 100);
+  lrc->max_gf_interval = 16;
+
+  lrc->static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+  if (oxcf->play_alternate && oxcf->lag_in_frames) {
+    if (lrc->max_gf_interval > oxcf->lag_in_frames - 1)
+      lrc->max_gf_interval = oxcf->lag_in_frames - 1;
+
+    if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+      lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+  }
+
+  if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval)
+    lrc->max_gf_interval = lrc->static_scene_max_gf_interval;
+}
+
 void vp9_restore_layer_context(VP9_COMP *const cpi) {
-  int temporal_layer = cpi->svc.temporal_layer_id;
-  LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
-  int frame_since_key = cpi->rc.frames_since_key;
-  int frame_to_key = cpi->rc.frames_to_key;
+  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  const int old_frame_since_key = cpi->rc.frames_since_key;
+  const int old_frame_to_key = cpi->rc.frames_to_key;
+
   cpi->rc = lc->rc;
+  cpi->twopass = lc->twopass;
   cpi->oxcf.target_bandwidth = lc->target_bandwidth;
   cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
   cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
@@ -111,17 +181,44 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   cpi->output_framerate = lc->framerate;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
-  cpi->rc.frames_since_key = frame_since_key;
-  cpi->rc.frames_to_key = frame_to_key;
+  if (cpi->svc.number_temporal_layers > 1) {
+    cpi->rc.frames_since_key = old_frame_since_key;
+    cpi->rc.frames_to_key = old_frame_to_key;
+  }
 }
 
 void vp9_save_layer_context(VP9_COMP *const cpi) {
-  int temporal_layer = cpi->svc.temporal_layer_id;
-  LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+
   lc->rc = cpi->rc;
-  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
-  lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
-  lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
-  lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
+  lc->twopass = cpi->twopass;
+  lc->target_bandwidth = (int)oxcf->target_bandwidth;
+  lc->starting_buffer_level = oxcf->starting_buffer_level;
+  lc->optimal_buffer_level = oxcf->optimal_buffer_level;
+  lc->maximum_buffer_size = oxcf->maximum_buffer_size;
   lc->framerate = cpi->output_framerate;
 }
+
+void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
+  int i;
+
+  for (i = 0; i < svc->number_spatial_layers; ++i) {
+    struct twopass_rc *const twopass = &svc->layer_context[i].twopass;
+
+    svc->spatial_layer_id = i;
+    vp9_init_second_pass(cpi);
+
+    twopass->total_stats.spatial_layer_id = i;
+    twopass->total_left_stats.spatial_layer_id = i;
+  }
+  svc->spatial_layer_id = 0;
+}
+
+void vp9_inc_frame_in_layer(SVC *svc) {
+  LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1)
+      ? &svc->layer_context[svc->temporal_layer_id]
+      : &svc->layer_context[svc->spatial_layer_id];
+  ++lc->current_video_frame_in_layer;
+}
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h
index e81b0b7..2abed30 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -28,6 +28,8 @@ typedef struct {
   double framerate;
   int avg_frame_size;
   struct twopass_rc twopass;
+  struct vpx_fixed_buf rc_twopass_stats_in;
+  unsigned int current_video_frame_in_layer;
 } LAYER_CONTEXT;
 
 typedef struct {
@@ -35,8 +37,8 @@ typedef struct {
   int temporal_layer_id;
   int number_spatial_layers;
   int number_temporal_layers;
-  // Layer context used for rate control in temporal CBR mode or spatial
-  // two pass mode. Defined for temporal or spatial layers for now.
+  // Layer context used for rate control in one pass temporal CBR mode or
+  // two pass spatial mode. Defined for temporal or spatial layers for now.
   // Does not support temporal combined with spatial RC.
   LAYER_CONTEXT layer_context[MAX(VPX_TS_MAX_LAYERS, VPX_SS_MAX_LAYERS)];
 } SVC;
@@ -51,8 +53,12 @@ void vp9_update_layer_context_change_config(struct VP9_COMP *const cpi,
                                             const int target_bandwidth);
 
 // Prior to encoding the frame, update framerate-related quantities
-// for the current layer.
-void vp9_update_layer_framerate(struct VP9_COMP *const cpi);
+// for the current temporal layer.
+void vp9_update_temporal_layer_framerate(struct VP9_COMP *const cpi);
+
+// Update framerate-related quantities for the current spatial layer.
+void vp9_update_spatial_layer_framerate(struct VP9_COMP *const cpi,
+                                        double framerate);
 
 // Prior to encoding the frame, set the layer context, for the current layer
 // to be encoded, to the cpi struct.
@@ -61,6 +67,12 @@ void vp9_restore_layer_context(struct VP9_COMP *const cpi);
 // Save the layer context after encoding the frame.
 void vp9_save_layer_context(struct VP9_COMP *const cpi);
 
+// Initialize second pass rc for spatial svc.
+void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
+
+// Increment number of video frames in layer
+void vp9_inc_frame_in_layer(SVC *svc);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 6233116..0410273 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -41,7 +41,10 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             struct scale_factors *scale,
                                             int x, int y) {
   const int which_mv = 0;
-  MV mv = { mv_row, mv_col };
+  const MV mv = { mv_row, mv_col };
+  const InterpKernel *const kernel =
+    vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter);
+
   enum mv_precision mv_precision_uv;
   int uv_stride;
   if (uv_block_size == 8) {
@@ -58,7 +61,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             16, 16,
                             which_mv,
-                            xd->interp_kernel, MV_PRECISION_Q3, x, y);
+                            kernel, MV_PRECISION_Q3, x, y);
 
   vp9_build_inter_predictor(u_mb_ptr, uv_stride,
                             &pred[256], uv_block_size,
@@ -66,7 +69,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             uv_block_size, uv_block_size,
                             which_mv,
-                            xd->interp_kernel, mv_precision_uv, x, y);
+                            kernel, mv_precision_uv, x, y);
 
   vp9_build_inter_predictor(v_mb_ptr, uv_stride,
                             &pred[512], uv_block_size,
@@ -74,7 +77,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             uv_block_size, uv_block_size,
                             which_mv,
-                            xd->interp_kernel, mv_precision_uv, x, y);
+                            kernel, mv_precision_uv, x, y);
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
@@ -133,7 +136,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-  MV *ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0].as_mv;
+  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
 
   // Save input state
   struct buf_2d src = x->plane[0].src;
@@ -250,8 +253,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
         if (cpi->frames[frame] == NULL)
           continue;
 
-        mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row = 0;
-        mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col = 0;
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
@@ -284,8 +287,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
            cpi->frames[frame]->v_buffer + mb_uv_offset,
            cpi->frames[frame]->y_stride,
            mb_uv_height,
-           mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
-           mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
+           mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+           mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
            predictor, scale,
            mb_col * 16, mb_row * 16);
 
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index bb5f1c2..291ccb3 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -108,7 +108,7 @@ void vp9_coef_tree_initialize() {
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
 }
 
-static void fill_value_tokens() {
+void vp9_tokenize_initialize() {
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
   const vp9_extra_bit *const e = vp9_extra_bits;
 
@@ -162,7 +162,6 @@ struct tokenize_b_args {
   VP9_COMP *cpi;
   MACROBLOCKD *xd;
   TOKENEXTRA **tp;
-  uint8_t *token_cache;
 };
 
 static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -213,10 +212,10 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   VP9_COMP *cpi = args->cpi;
   MACROBLOCKD *xd = args->xd;
   TOKENEXTRA **tp = args->tp;
-  uint8_t *token_cache = args->token_cache;
+  uint8_t token_cache[32 * 32];
   struct macroblock_plane *p = &cpi->mb.plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int pt; /* near block/prev token context index */
   int c;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
@@ -310,12 +309,12 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   TOKENEXTRA *t_backup = *t;
   const int ctx = vp9_get_skip_context(xd);
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
-  struct tokenize_b_args arg = {cpi, xd, t, cpi->mb.token_cache};
+  struct tokenize_b_args arg = {cpi, xd, t};
   if (mbmi->skip) {
     if (!dry_run)
       cm->counts.skip[ctx][1] += skip_inc;
@@ -333,7 +332,3 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
     *t = t_backup;
   }
 }
-
-void vp9_tokenize_initialize() {
-  fill_value_tokens();
-}
diff --git a/libvpx/vp9/encoder/vp9_variance.c b/libvpx/vp9/encoder/vp9_variance.c
index 8bc3850..71867a9 100644
--- a/libvpx/vp9/encoder/vp9_variance.c
+++ b/libvpx/vp9/encoder/vp9_variance.c
@@ -216,7 +216,7 @@ unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 33, 64, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
-  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  vp9_comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
   return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -273,7 +273,7 @@ unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 65, 32, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  vp9_comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
   return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -330,7 +330,7 @@ unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 17, 32, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  vp9_comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
   return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -387,7 +387,7 @@ unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 33, 16, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
-  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  vp9_comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
   return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -417,6 +417,12 @@ unsigned int vp9_variance32x32_c(const uint8_t *src_ptr,
   return (var - (((int64_t)avg * avg) >> 10));
 }
 
+// Thin wrapper: forwards to variance() with both size arguments set to 16.
+void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) {
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+}
+
 unsigned int vp9_variance16x16_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -614,7 +620,7 @@ unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
 
   // Now filter Verticaly
   var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
-  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  vp9_comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
   return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -658,7 +664,7 @@ unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 9, 8, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  vp9_comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
   return vp9_variance8x8(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -703,7 +709,7 @@ unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
                                     1, 17, 16, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
 
-  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  vp9_comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
   return vp9_variance16x16(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -747,7 +753,7 @@ unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 65, 64, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
-  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  vp9_comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
   return vp9_variance64x64(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -791,7 +797,7 @@ unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 33, 32, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  vp9_comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
   return vp9_variance32x32(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -955,7 +961,7 @@ unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 9, 16, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
-  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  vp9_comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
   return vp9_variance16x8(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -999,7 +1005,7 @@ unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 17, 8, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  vp9_comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
   return vp9_variance8x16(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -1043,7 +1049,7 @@ unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 5, 8, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
+  vp9_comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
   return vp9_variance8x4(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -1089,6 +1095,23 @@ unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 9, 4, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
-  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
+  vp9_comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
   return vp9_variance4x8(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
 }
+
+
+// Rounded average of two predictors: comp_pred = (pred + ref + 1) >> 1.
+// comp_pred and pred step by width per row; ref steps by ref_stride.
+void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                       int height, const uint8_t *ref, int ref_stride) {
+  int rows_left, col;
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // The + 1 before the shift rounds halves upward.
+    for (col = 0; col < width; ++col)
+      comp_pred[col] = (pred[col] + ref[col] + 1) >> 1;
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h
index 3bc2091..62e20dc 100644
--- a/libvpx/vp9/encoder/vp9_variance.h
+++ b/libvpx/vp9/encoder/vp9_variance.h
@@ -100,21 +100,9 @@ typedef struct vp9_variance_vtable {
   vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
-static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp;
-      tmp = pred[j] + ref[j];
-      comp_pred[j] = (tmp + 1) >> 1;
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
+void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                       int height, const uint8_t *ref, int ref_stride);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_write_bit_buffer.c b/libvpx/vp9/encoder/vp9_write_bit_buffer.c
new file mode 100644
index 0000000..962d0ca
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_write_bit_buffer.c
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
+size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT != 0);  // ceil
+}
+
+// Writes one bit MSB-first; a byte's first bit assigns the whole byte.
+void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const int pos = (int)wb->bit_offset;
+  const int byte_idx = pos / CHAR_BIT;
+  const int shift = CHAR_BIT - 1 - pos % CHAR_BIT;
+  if (shift == CHAR_BIT - 1)
+    wb->bit_buffer[byte_idx] = bit << shift;
+  else
+    wb->bit_buffer[byte_idx] =
+        (wb->bit_buffer[byte_idx] & ~(1 << shift)) | (bit << shift);
+  wb->bit_offset = pos + 1;
+}
+
+// Emits the low `bits` bits of data, the highest of them first.
+void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) {
+  int shift = bits;
+  while (--shift >= 0) vp9_wb_write_bit(wb, (data >> shift) & 1);
+}
diff --git a/libvpx/vp9/encoder/vp9_write_bit_buffer.h b/libvpx/vp9/encoder/vp9_write_bit_buffer.h
index 1795e05..073608d 100644
--- a/libvpx/vp9/encoder/vp9_write_bit_buffer.h
+++ b/libvpx/vp9/encoder/vp9_write_bit_buffer.h
@@ -24,29 +24,11 @@ struct vp9_write_bit_buffer {
   size_t bit_offset;
 };
 
-static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
-  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
-}
-
-static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
-  const int off = (int)wb->bit_offset;
-  const int p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
-  if (q == CHAR_BIT -1) {
-    wb->bit_buffer[p] = bit << q;
-  } else {
-    wb->bit_buffer[p] &= ~(1 << q);
-    wb->bit_buffer[p] |= bit << q;
-  }
-  wb->bit_offset = off + 1;
-}
-
-static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
-                              int data, int bits) {
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    vp9_wb_write_bit(wb, (data >> bit) & 1);
-}
+size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb);
+
+void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
+
+void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits);
 
 
 #ifdef __cplusplus
diff --git a/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
new file mode 100644
index 0000000..f31b176
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "vpx/vpx_integer.h"
+
+// Computes four 32x32 SADs in a single pass: src is compared against each of
+// ref[0..3] and the four sums are stored to res[0..3] in the same order.
+// src rows are fetched with an aligned 256-bit load (_mm256_load_si256), the
+// ref rows with unaligned loads.
+void vp9_sad32x32x4d_avx2(uint8_t *src,
+                          int src_stride,
+                          uint8_t *ref[4],
+                          int ref_stride,
+                          unsigned int res[4]) {
+  uint8_t *r0 = ref[0];
+  uint8_t *r1 = ref[1];
+  uint8_t *r2 = ref[2];
+  uint8_t *r3 = ref[3];
+
+  // One accumulator per reference. _mm256_sad_epu8 produces a partial sum in
+  // each 64-bit element, so every accumulator carries four partial sums.
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  __m256i acc3 = _mm256_setzero_si256();
+  __m256i lo_halves, hi_halves;
+  int row;
+
+  for (row = 0; row < 32; ++row) {
+    // A single 32-byte src row is shared by all four comparisons.
+    const __m256i s = _mm256_load_si256((__m256i *)src);
+    const __m256i d0 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r0), s);
+    const __m256i d1 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r1), s);
+    const __m256i d2 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r2), s);
+    const __m256i d3 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r3), s);
+
+    acc0 = _mm256_add_epi32(acc0, d0);
+    acc1 = _mm256_add_epi32(acc1, d1);
+    acc2 = _mm256_add_epi32(acc2, d2);
+    acc3 = _mm256_add_epi32(acc3, d3);
+
+    src += src_stride;
+    r0 += ref_stride;
+    r1 += ref_stride;
+    r2 += ref_stride;
+    r3 += ref_stride;
+  }
+  {
+    __m128i total;
+
+    // Each 64-bit element holds its sum in the low 32 bits with zeros above.
+    // Shift the ref1/ref3 sums up by 4 bytes into those empty upper halves...
+    acc1 = _mm256_slli_si256(acc1, 4);
+    acc3 = _mm256_slli_si256(acc3, 4);
+
+    // ...and OR them in, giving (ref0, ref1) and (ref2, ref3) pairs.
+    acc0 = _mm256_or_si256(acc0, acc1);
+    acc2 = _mm256_or_si256(acc2, acc3);
+
+    // Interleave so every 128-bit lane reads ref0, ref1, ref2, ref3.
+    lo_halves = _mm256_unpacklo_epi64(acc0, acc2);
+    hi_halves = _mm256_unpackhi_epi64(acc0, acc2);
+
+    // Add the two 64-bit-element partial sums within each lane.
+    lo_halves = _mm256_add_epi32(lo_halves, hi_halves);
+
+    // Add the upper 128-bit lane to the lower one for the final four sums.
+    total = _mm_add_epi32(_mm256_castsi256_si128(lo_halves),
+                          _mm256_extractf128_si256(lo_halves, 1));
+
+    // Unaligned 128-bit store of the four 32-bit results.
+    _mm_storeu_si128((__m128i *)res, total);
+  }
+}
+
+// Computes four 64x64 SADs in a single pass: src is compared against each of
+// ref[0..3] and the four sums are stored to res[0..3] in the same order.
+// Every row is handled as two 32-byte halves; src halves are fetched with
+// aligned 256-bit loads, the ref halves with unaligned loads.
+void vp9_sad64x64x4d_avx2(uint8_t *src,
+                          int src_stride,
+                          uint8_t *ref[4],
+                          int ref_stride,
+                          unsigned int res[4]) {
+  uint8_t *r0 = ref[0];
+  uint8_t *r1 = ref[1];
+  uint8_t *r2 = ref[2];
+  uint8_t *r3 = ref[3];
+
+  // One accumulator per reference. _mm256_sad_epu8 produces a partial sum in
+  // each 64-bit element, so every accumulator carries four partial sums.
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  __m256i acc3 = _mm256_setzero_si256();
+  __m256i lo_halves, hi_halves;
+  int row;
+
+  for (row = 0; row < 64; ++row) {
+    // Left and right 32-byte halves of the src row.
+    const __m256i s0 = _mm256_load_si256((__m256i *)src);
+    const __m256i s1 = _mm256_load_si256((__m256i *)(src + 32));
+
+    // SADs of the left halves of the four ref rows...
+    const __m256i l0 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r0), s0);
+    const __m256i l1 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r1), s0);
+    const __m256i l2 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r2), s0);
+    const __m256i l3 = _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)r3), s0);
+    // ...and of the right halves (32 bytes further along each row).
+    const __m256i h0 =
+        _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)(r0 + 32)), s1);
+    const __m256i h1 =
+        _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)(r1 + 32)), s1);
+    const __m256i h2 =
+        _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)(r2 + 32)), s1);
+    const __m256i h3 =
+        _mm256_sad_epu8(_mm256_loadu_si256((__m256i *)(r3 + 32)), s1);
+
+    acc0 = _mm256_add_epi32(acc0, l0);
+    acc1 = _mm256_add_epi32(acc1, l1);
+    acc2 = _mm256_add_epi32(acc2, l2);
+    acc3 = _mm256_add_epi32(acc3, l3);
+    acc0 = _mm256_add_epi32(acc0, h0);
+    acc1 = _mm256_add_epi32(acc1, h1);
+    acc2 = _mm256_add_epi32(acc2, h2);
+    acc3 = _mm256_add_epi32(acc3, h3);
+
+    src += src_stride;
+    r0 += ref_stride;
+    r1 += ref_stride;
+    r2 += ref_stride;
+    r3 += ref_stride;
+  }
+  {
+    __m128i total;
+
+    // Each 64-bit element holds its sum in the low 32 bits with zeros above.
+    // Shift the ref1/ref3 sums up by 4 bytes into those empty upper halves...
+    acc1 = _mm256_slli_si256(acc1, 4);
+    acc3 = _mm256_slli_si256(acc3, 4);
+
+    // ...and OR them in, giving (ref0, ref1) and (ref2, ref3) pairs.
+    acc0 = _mm256_or_si256(acc0, acc1);
+    acc2 = _mm256_or_si256(acc2, acc3);
+
+    // Interleave so every 128-bit lane reads ref0, ref1, ref2, ref3.
+    lo_halves = _mm256_unpacklo_epi64(acc0, acc2);
+    hi_halves = _mm256_unpackhi_epi64(acc0, acc2);
+
+    // Add the two 64-bit-element partial sums within each lane.
+    lo_halves = _mm256_add_epi32(lo_halves, hi_halves);
+
+    // Add the upper 128-bit lane to the lower one for the final four sums.
+    total = _mm_add_epi32(_mm256_castsi256_si128(lo_halves),
+                          _mm256_extractf128_si256(lo_halves, 1));
+
+    // Unaligned 128-bit store of the four 32-bit results.
+    _mm_storeu_si128((__m128i *)res, total);
+  }
+}